From 159a274a977c496434dbc484a1b253663cde4eed Mon Sep 17 00:00:00 2001 From: Yuri Takhteyev Date: Mon, 17 Nov 2008 00:17:15 -0800 Subject: Attempting a refactoring, breaking markdown into multiple files. --- markdown.py | 2251 +---------------------------------------------------------- 1 file changed, 2 insertions(+), 2249 deletions(-) mode change 100755 => 100644 markdown.py (limited to 'markdown.py') diff --git a/markdown.py b/markdown.py old mode 100755 new mode 100644 index baf6567..7cba3a8 --- a/markdown.py +++ b/markdown.py @@ -1,2253 +1,6 @@ #!/usr/bin/env python -""" -Python Markdown -=============== -Python Markdown converts Markdown to HTML and can be used as a library or -called from the command line. - -## Basic usage as a module: - - import markdown - md = Markdown() - html = md.convert(your_text_string) - -## Basic use from the command line: - - python markdown.py source.txt > destination.html - -Run "python markdown.py --help" to see more options. - -## Extensions - -See for more -information and instructions on how to extend the functionality of -Python Markdown. Read that before you try modifying this file. - -## Authors and License - -Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and -maintained by [Yuri Takhteyev](http://www.freewisdom.org), [Waylan -Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com). - -Contact: markdown@freewisdom.org - -Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later) -Copyright 200? Django Software Foundation (OrderedDict implementation) -Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b) -Copyright 2004 Manfred Stienstra (the original version) - -License: BSD (see docs/LICENSE for details). -""" - -version = "2.0-alpha" -version_info = (2,0,0, "beta") - -import re -import sys -import codecs -import htmlentitydefs -import logging -from logging import DEBUG, INFO, WARN, ERROR, CRITICAL -from urlparse import urlparse, urlunparse - - -""" -CONSTANTS -============================================================================= -""" - -""" -Constants you might want to modify ------------------------------------------------------------------------------ -""" - -# default logging level for command-line use -COMMAND_LINE_LOGGING_LEVEL = CRITICAL -TAB_LENGTH = 4 # expand tabs to this many spaces -ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz"> -SMART_EMPHASIS = True # this_or_that does not become thisorthat -HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode -BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul" - +"|script|noscript|form|fieldset|iframe|math" - +"|ins|del|hr|hr/|style|li|dt|dd|tr") - -""" -Constants you probably do not need to change ------------------------------------------------------------------------------ -""" - -RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'), - # Hebrew (0590-05FF), Arabic (0600-06FF), - # Syriac (0700-074F), Arabic supplement (0750-077F), - # Thaana (0780-07BF), Nko (07C0-07FF). - (u'\u2D30', u'\u2D7F'), # Tifinagh - ) - -EXECUTABLE_NAME_FOR_USAGE = "python markdown.py" -""" The name used in the usage statement displayed for python versions < 2.3. -(With python 2.3 and higher the usage statement is generated by optparse -and uses the actual name of the executable called.) """ - -# Placeholders -STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder -ETX = u'\u0003' # Use ETX ("End of text") for end-of-placeholder -HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:" -HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX -INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:" -INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX -AMP_SUBSTITUTE = STX+"amp"+ETX - - -""" -AUXILIARY GLOBAL FUNCTIONS -============================================================================= -""" - -def message(level, text): - """ A wrapper method for logging debug messages. """ - logging.getLogger('MARKDOWN').log(level, text) - -def isString(s): - """ Check if it's string """ - return isinstance(s, unicode) or isinstance(s, str) - -## Import -def importETree(): - """Import the best implementation of ElementTree, return a module object.""" - etree_in_c = None - try: # Is it Python 2.5+ with C implemenation of ElementTree installed? - import xml.etree.cElementTree as etree_in_c - except ImportError: - try: # Is it Python 2.5+ with Python implementation of ElementTree? - import xml.etree.ElementTree as etree - except ImportError: - try: # An earlier version of Python with cElementTree installed? - import cElementTree as etree_in_c - except ImportError: - try: # An earlier version of Python with Python ElementTree? - import elementtree.ElementTree as etree - except ImportError: - message(CRITICAL, "Failed to import ElementTree") - sys.exit(1) - if etree_in_c and etree_in_c.VERSION < "1.0": - message(CRITICAL, "For cElementTree version 1.0 or higher is required.") - sys.exit(1) - elif etree_in_c : - return etree_in_c - elif etree.VERSION < "1.1": - message(CRITICAL, "For ElementTree version 1.1 or higher is required") - sys.exit(1) - else : - return etree - -def isBlockLevel(tag): - """Check if the tag is a block level HTML tag.""" - return BLOCK_LEVEL_ELEMENTS.match(tag) - -ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} - -def handleAttributes(text, parent): - """Set values of an element based on attribute definitions ({@id=123}).""" - def attributeCallback(match): - parent.set(match.group(1), match.group(2)) - return ATTR_RE.sub(attributeCallback, text) - -def dequote(string): - """Remove quotes from around a string.""" - if ( ( string.startswith('"') and string.endswith('"')) - or (string.startswith("'") and string.endswith("'")) ): - return string[1:-1] - else: - return string - - -""" -OVERALL DESIGN -============================================================================= - -Markdown processing takes place in four steps: - -1. A bunch of "preprocessors" munge the input text. -2. BlockParser() parses the high-level structural elements of the - pre-processed text into an ElementTree. -3. A bunch of "treeprocessors" are run against the ElementTree. One such - treeprocessor runs InlinePatterns against the ElementTree, detecting inline - markup. -4. Some post-processors are run against the text after the ElementTree has - been serialized into text. -5. The output is written to a string. - -Those steps are put together by the Markdown() class. - -The code below is organized as follows: - -1. BlockParser and it's BlockProcessors - does core block parsing. -2. All the preprocessors, patterns, treeprocessors, and postprocessors. -3. Markdown class - does the high-level wrapping. -""" - - -""" -CORE MARKDOWN BLOCKPARSER -============================================================================= - -This parser handles basic parsing of Markdown blocks. It doesn't concern itself -with inline elements such as **bold** or *italics*, but rather just catches -blocks, lists, quotes, etc. - -The BlockParser is made up of a bunch of BlockProssors, each handling a -different type of block. Extensions may add/replace/remove BlockProcessors -as they need to alter how markdown blocks are parsed. - -""" - -class BlockProcessor: - """ Base class for block processors. - - Each subclass will provide the methods below to work with the source and - tree. Each processor will need to define it's own ``test`` and ``run`` - methods. The ``test`` method should return True or False, to indicate - whether the current block should be processed by this processor. If the - test passes, the parser will call the processors ``run`` method. - - """ - - def __init__(self, parser=None): - self.parser = parser - - def lastChild(self, parent): - """ Return the last child of an etree element. """ - if len(parent): - return parent[-1] - else: - return None - - def detab(self, text): - """ Remove a tab from the front of each line of the given text. """ - newtext = [] - lines = text.split('\n') - for line in lines: - if line.startswith(' '*TAB_LENGTH): - newtext.append(line[TAB_LENGTH:]) - elif not line.strip(): - newtext.append('') - else: - break - return '\n'.join(newtext), '\n'.join(lines[len(newtext):]) - - def looseDetab(self, text): - """ Remove a tab from front of lines but allowing dedented lines. """ - lines = text.split('\n') - for i in range(len(lines)): - if lines[i].startswith(' '*TAB_LENGTH): - lines[i] = lines[i][TAB_LENGTH:] - return '\n'.join(lines) - - def test(self, parent, block): - """ Test for block type. Must be overridden by subclasses. - - As the parser loops through processors, it will call the ``test`` method - on each to determine if the given block of text is of that type. This - method must return a boolean ``True`` or ``False``. The actual method of - testing is left to the needs of that particular block type. It could - be as simple as ``block.startswith(some_string)`` or a complex regular - expression. As the block type may be different depending on the parent - of the block (i.e. inside a list), the parent etree element is also - provided and may be used as part of the test. - - Keywords: - - * ``parent``: A etree element which will be the parent of the block. - * ``block``: A block of text from the source which has been split at - blank lines. - """ - pass - - def run(self, parent, blocks): - """ Run processor. Must be overridden by subclasses. - - When the parser determines the appropriate type of a block, the parser - will call the corresponding processor's ``run`` method. This method - should parse the individual lines of the block and append them to - the etree. - - Note that both the ``parent`` and ``etree`` keywords are pointers - to instances of the objects which should be edited in place. Each - processor must make changes to the existing objects as there is no - mechanism to return new/different objects to replace them. - - This means that this method should be adding SubElements or adding text - to the parent, and should remove (``pop``) or add (``insert``) items to - the list of blocks. - - Keywords: - - * ``parent``: A etree element which is the parent of the current block. - * ``blocks``: A list of all remaining blocks of the document. - """ - pass - - -class ListIndentProcessor(BlockProcessor): - """ Process children of list items. - - Example: - * a list item - process this part - - or this part - - """ - - def test(self, parent, block): - return block.startswith(' '*TAB_LENGTH) and \ - (parent.tag == "li" or \ - (len(parent) and parent[-1] and \ - (parent[-1].tag == "ul" or parent[-1].tag == "ol") - ) - ) - - def run(self, parent, blocks): - block = self.looseDetab(blocks.pop(0)) - sibling = self.lastChild(parent) - if parent.tag == 'li': - # The parent is already a li. Just parse the child block. - self.parser.parseBlocks(parent, [block]) - elif len(sibling) and sibling[-1].tag == 'li': - # The parent is a list (``ol`` or ``ul``) which has children. - # Assume the last child li is the parent of this block. - if sibling[-1].text: - # If the parent li has text, that text needs to be moved to a p - block = '%s\n\n%s' % (sibling[-1].text, block) - sibling[-1].text = '' - self.parser.parseChunk(sibling[-1], block) - else: - # Create a new li and parse the block with it as the parent. - li = etree.SubElement(sibling, 'li') - self.parser.parseBlocks(li, [block]) - - -class CodeBlockProcessor(BlockProcessor): - """ Process code blocks. """ - - def test(self, parent, block): - return block.startswith(' '*TAB_LENGTH) - - def run(self, parent, blocks): - sibling = self.lastChild(parent) - block = blocks.pop(0) - theRest = '' - if sibling and sibling.tag == "pre" and len(sibling) \ - and sibling[0].tag == "code": - # The previous block was a code block. As blank lines do not start - # new code blocks, append this block to the previous, adding back - # linebreaks removed from the split into a list. - code = sibling[0] - block, theRest = self.detab(block) - code.text = AtomicString('%s\n%s\n' % (code.text, block.rstrip())) - else: - # This is a new codeblock. Create the elements and insert text. - pre = etree.SubElement(parent, 'pre') - code = etree.SubElement(pre, 'code') - block, theRest = self.detab(block) - code.text = AtomicString('%s\n' % block.rstrip()) - if theRest: - # This block contained unindented line(s) after the first indented - # line. Insert these lines as the first block of the master blocks - # list for future processing. - blocks.insert(0, theRest) - - -class BlockQuoteProcessor(BlockProcessor): - - RE = re.compile(r'(^|\n)[ ]{0,3}>[ ](.*)') - - def test(self, parent, block): - return bool(self.RE.search(block)) - - def run(self, parent, blocks): - block = blocks.pop(0) - m = self.RE.search(block) - if m: - before = block[:m.start()] # Lines before blockquote - # Pass lines before blockquote in recursively for parsing forst. - self.parser.parseBlocks(parent, [before]) - # Remove ``> `` from begining of each line. - block = '\n'.join([self.clean(line) for line in - block[m.start():].split('\n')]) - sibling = self.lastChild(parent) - if sibling and sibling.tag == "blockquote": - # Previous block was a blockquote so set that as this blocks parent - quote = sibling - else: - # This is a new blockquote. Create a new parent element. - quote = etree.SubElement(parent, 'blockquote') - # Recursively parse block with blockquote as parent. - self.parser.parseChunk(quote, block) - - def clean(self, line): - """ Remove ``>`` from beginning of a line. """ - m = self.RE.match(line) - if line.strip() == ">": - return "" - elif m: - return m.group(2) - else: - return line - -class OListProcessor(BlockProcessor): - """ Process ordered list blocks. """ - - TAG = 'ol' - # Detect an item (``1. item``). ``group(1)`` contains contents of item. - RE = re.compile(r'^[ ]{0,3}\d+\.[ ](.*)') - # Detect items on secondary lines. they can be of either list type. - CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ](.*)') - # Detect indented (nested) items of either type - INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ].*') - - def test(self, parent, block): - return bool(self.RE.match(block)) - - def run(self, parent, blocks): - # Check fr multiple items in one block. - items = self.get_items(blocks.pop(0)) - sibling = self.lastChild(parent) - if sibling and (sibling.tag == 'ol' or sibling.tag == 'ul'): - # Previous block was a list item, so set that as parent - lst = sibling - # make sure previous item is in a p. - if len(lst) and lst[-1].text and not len(lst[-1]): - p = etree.SubElement(lst[-1], 'p') - p.text = lst[-1].text - lst[-1].text = '' - # parse first block differently as it gets wrapped in a p. - li = etree.SubElement(lst, 'li') - self.parser.state.set('looselist') - firstitem = items.pop(0) - self.parser.parseBlocks(li, [firstitem]) - self.parser.state.reset() - else: - # This is a new list so create parent with appropriate tag. - lst = etree.SubElement(parent, self.TAG) - self.parser.state.set('list') - # Loop through items in block, recursively parsing each with the - # appropriate parent. - for item in items: - if item.startswith(' '*TAB_LENGTH): - # Item is indented. Parse with last item as parent - self.parser.parseBlocks(lst[-1], [item]) - else: - # New item. Create li and parse with it as parent - li = etree.SubElement(lst, 'li') - self.parser.parseBlocks(li, [item]) - self.parser.state.reset() - - def get_items(self, block): - """ Break a block into list items. """ - items = [] - for line in block.split('\n'): - m = self.CHILD_RE.match(line) - if m: - # This is a new item. Append - items.append(m.group(3)) - elif self.INDENT_RE.match(line): - # This is an indented (possibly nested) item. - if items[-1].startswith(' '*TAB_LENGTH): - # Previous item was indented. Append to that item. - items[-1] = '%s\n%s' % (items[-1], line) - else: - items.append(line) - else: - # This is another line of previous item. Append to that item. - items[-1] = '%s\n%s' % (items[-1], line) - return items - - -class UListProcessor(OListProcessor): - """ Process unordered list blocks. """ - - TAG = 'ul' - RE = re.compile(r'^[ ]{0,3}[*+-][ ](.*)') - - -class HashHeaderProcessor(BlockProcessor): - """ Process Hash Headers. """ - - # Detect a header at start of any line in block - RE = re.compile(r'(^|\n)(?P#{1,6})(?P
.*?)#*(\n|$)') - - def test(self, parent, block): - return bool(self.RE.search(block)) - - def run(self, parent, blocks): - block = blocks.pop(0) - m = self.RE.search(block) - if m: - before = block[:m.start()] # All lines before header - after = block[m.end():] # All lines after header - if before: - # As the header was not the first line of the block and the - # lines before the header must be parsed first, - # recursively parse this lines as a block. - self.parser.parseBlocks(parent, [before]) - # Create header using named groups from RE - h = etree.SubElement(parent, 'h%d' % len(m.group('level'))) - h.text = m.group('header').strip() - if after: - # Insert remaining lines as first block for future parsing. - blocks.insert(0, after) - else: - # This should never happen, but just in case... - message(CRITICAL, "We've got a problem header!") - - -class SetextHeaderProcessor(BlockProcessor): - """ Process Setext-style Headers. """ - - # Detect Setext-style header. Must be first 2 lines of block. - RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE) - - def test(self, parent, block): - return bool(self.RE.match(block)) - - def run(self, parent, blocks): - lines = blocks.pop(0).split('\n') - # Determine level. ``=`` is 1 and ``-`` is 2. - if lines[1].startswith('='): - level = 1 - else: - level = 2 - h = etree.SubElement(parent, 'h%d' % level) - h.text = lines[0].strip() - if len(lines) > 2: - # Block contains additional lines. Add to master blocks for later. - blocks.insert(0, '\n'.join(lines[2:])) - - -class HRProcessor(BlockProcessor): - """ Process Horizontal Rules. """ - - RE = r'[ ]{0,3}(?P[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*' - # Detect hr on any line of a block. - SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE) - # Match a hr on a single line of text. - MATCH_RE = re.compile(r'^%s$' % RE) - - def test(self, parent, block): - return bool(self.SEARCH_RE.search(block)) - - def run(self, parent, blocks): - lines = blocks.pop(0).split('\n') - prelines = [] - # Check for lines in block before hr. - for line in lines: - m = self.MATCH_RE.match(line) - if m: - break - else: - prelines.append(line) - if len(prelines): - # Recursively parse lines before hr so they get parsed first. - self.parser.parseBlocks(parent, ['\n'.join(prelines)]) - # create hr - hr = etree.SubElement(parent, 'hr') - # check for lines in block after hr. - lines = lines[len(prelines)+1:] - if len(lines): - # Add lines after hr to master blocks for later parsing. - blocks.insert(0, '\n'.join(lines)) - - -class EmptyBlockProcessor(BlockProcessor): - """ Process blocks and start with an empty line. """ - - # Detect a block that only contains whitespace - # or only whitespace on the first line. - RE = re.compile(r'^\s*\n') - - def test(self, parent, block): - return bool(self.RE.match(block)) - - def run(self, parent, blocks): - block = blocks.pop(0) - m = self.RE.match(block) - if m: - # Add remaining line to master blocks for later. - blocks.insert(0, block[m.end():]) - sibling = self.lastChild(parent) - if sibling and sibling.tag == 'pre' and sibling[0] and \ - sibling[0].tag == 'code': - # Last block is a codeblock. Append to preserve whitespace. - sibling[0].text = AtomicString('%s/n/n/n' % sibling[0].text ) - - -class ParagraphProcessor(BlockProcessor): - """ Process Paragraph blocks. """ - - def test(self, parent, block): - return True - - def run(self, parent, blocks): - block = blocks.pop(0) - if block.strip(): - # Not a blank block. Add to parent, otherwise throw it away. - if self.parser.state.isstate('list'): - # The parent is a tight-list. Append to parent.text - if parent.text: - parent.text = '%s\n%s' % (parent.text, block) - else: - parent.text = block.lstrip() - else: - # Create a regular paragraph - p = etree.SubElement(parent, 'p') - p.text = block.lstrip() - -class State(list): - """ Track the current and nested state of the parser. - - This utility class is used to track the state of the BlockParser and - support multiple levels if nesting. It's just a simple API wrapped around - a list. Each time a state is set, that state is appended to the end of the - list. Each time a state is reset, that state is removed from the end of - the list. - - Therefore, each time a state is set for a nested block, that state must be - reset when we back out of that level of nesting or the state could be - corrupted. - - While all the methods of a list object are available, only the three - defined below need be used. - - """ - - def set(self, state): - """ Set a new state. """ - self.append(state) - - def reset(self): - """ Step back one step in nested state. """ - self.pop() - - def isstate(self, state): - """ Test that top (current) level is of given state. """ - if len(self): - return self[-1] == state - else: - return False - -class BlockParser: - """ Parse Markdown blocks into an ElementTree object. - - A wrapper class that stitches the various BlockProcessors together, - looping through them and creating an ElementTree object. - """ - - def __init__(self): - self.blockprocessors = OrderedDict() - self.blockprocessors['empty'] = EmptyBlockProcessor(self) - self.blockprocessors['indent'] = ListIndentProcessor(self) - self.blockprocessors['code'] = CodeBlockProcessor(self) - self.blockprocessors['hashheader'] = HashHeaderProcessor(self) - self.blockprocessors['setextheader'] = SetextHeaderProcessor(self) - self.blockprocessors['hr'] = HRProcessor(self) - self.blockprocessors['olist'] = OListProcessor(self) - self.blockprocessors['ulist'] = UListProcessor(self) - self.blockprocessors['quote'] = BlockQuoteProcessor(self) - self.blockprocessors['paragraph'] = ParagraphProcessor(self) - self.state = State() - - def parseDocument(self, lines): - """ Parse a markdown document into an ElementTree. - - Given a list of lines, an ElementTree object (not just a parent Element) - is created and the root element is passed to the parser as the parent. - The ElementTree object is returned. - - This should only be called on an entire document, not pieces. - - """ - # Create a ElementTree from the lines - root = etree.Element("div") - self.parseChunk(root, '\n'.join(lines)) - return etree.ElementTree(root) - - def parseChunk(self, parent, text): - """ Parse a chunk of markdown text and attach to given etree node. - - While the ``text`` argument is generally assumed to contain multiple - blocks which will be split on blank lines, it could contain only one - block. Generally, this method would be called by extensions when - block parsing is required. - - The ``parent`` etree Element passed in is altered in place. - Nothing is returned. - - """ - self.parseBlocks(parent, text.split('\n\n')) - - def parseBlocks(self, parent, blocks): - """ Process blocks of markdown text and attach to given etree node. - - Given a list of ``blocks``, each blockprocessor is stepped through - until there are no blocks left. While an extension could potentially - call this method directly, it's generally expected to be used internally. - - This is a public method as an extension may need to add/alter additional - BlockProcessors which call this method to recursively parse a nested - block. - - """ - while blocks: - for processor in self.blockprocessors.values(): - if processor.test(parent, blocks[0]): - processor.run(parent, blocks) - break - - -""" -PRE-PROCESSORS -============================================================================= - -Preprocessors work on source text before we start doing anything too -complicated. -""" - -class Processor: - def __init__(self, markdown_instance=None): - if markdown_instance: - self.markdown = markdown_instance - - -class Preprocessor (Processor): - """ - Preprocessors are run after the text is broken into lines. - - Each preprocessor implements a "run" method that takes a pointer to a - list of lines of the document, modifies it as necessary and returns - either the same pointer or a pointer to a new list. - - Preprocessors must extend markdown.Preprocessor. - - """ - def run(self, lines): - """ - Each subclass of Preprocessor should override the `run` method, which - takes the document as a list of strings split by newlines and returns - the (possibly modified) list of lines. - - """ - pass - - -class HtmlBlockPreprocessor(Preprocessor): - """Remove html blocks from the text and store them for later retrieval.""" - - right_tag_patterns = ["", "%s>"] - - def _get_left_tag(self, block): - return block[1:].replace(">", " ", 1).split()[0].lower() - - def _get_right_tag(self, left_tag, block): - for p in self.right_tag_patterns: - tag = p % left_tag - i = block.rfind(tag) - if i > 2: - return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) - return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) - - def _equal_tags(self, left_tag, right_tag): - if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. - return True - if ("/" + left_tag) == right_tag: - return True - if (right_tag == "--" and left_tag == "--"): - return True - elif left_tag == right_tag[1:] \ - and right_tag[0] != "<": - return True - else: - return False - - def _is_oneliner(self, tag): - return (tag in ['hr', 'hr/']) - - def run(self, lines): - text = "\n".join(lines) - new_blocks = [] - text = text.split("\n\n") - items = [] - left_tag = '' - right_tag = '' - in_tag = False # flag - - while text: - block = text[0] - if block.startswith("\n"): - block = block[1:] - text = text[1:] - - if block.startswith("\n"): - block = block[1:] - - if not in_tag: - if block.startswith("<"): - left_tag = self._get_left_tag(block) - right_tag, data_index = self._get_right_tag(left_tag, block) - - if data_index < len(block): - text.insert(0, block[data_index:]) - block = block[:data_index] - - if not (isBlockLevel(left_tag) \ - or block[1] in ["!", "?", "@", "%"]): - new_blocks.append(block) - continue - - if self._is_oneliner(left_tag): - new_blocks.append(block.strip()) - continue - - if block[1] == "!": - # is a comment block - left_tag = "--" - right_tag, data_index = self._get_right_tag(left_tag, block) - # keep checking conditions below and maybe just append - - if block.rstrip().endswith(">") \ - and self._equal_tags(left_tag, right_tag): - new_blocks.append( - self.markdown.htmlStash.store(block.strip())) - continue - else: #if not block[1] == "!": - # if is block level tag and is not complete - - if isBlockLevel(left_tag) or left_tag == "--" \ - and not block.rstrip().endswith(">"): - items.append(block.strip()) - in_tag = True - else: - new_blocks.append( - self.markdown.htmlStash.store(block.strip())) - - continue - - new_blocks.append(block) - - else: - items.append(block.strip()) - - right_tag, data_index = self._get_right_tag(left_tag, block) - - if self._equal_tags(left_tag, right_tag): - # if find closing tag - in_tag = False - new_blocks.append( - self.markdown.htmlStash.store('\n\n'.join(items))) - items = [] - - if items: - new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) - new_blocks.append('\n') - - new_text = "\n\n".join(new_blocks) - return new_text.split("\n") - - -class ReferencePreprocessor(Preprocessor): - """ Remove reference definitions from text and store for later use. """ - - RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL) - - def run (self, lines): - new_text = []; - for line in lines: - m = self.RE.match(line) - if m: - id = m.group(2).strip().lower() - t = m.group(4).strip() # potential title - if not t: - self.markdown.references[id] = (m.group(3), t) - elif (len(t) >= 2 - and (t[0] == t[-1] == "\"" - or t[0] == t[-1] == "\'" - or (t[0] == "(" and t[-1] == ")") ) ): - self.markdown.references[id] = (m.group(3), t[1:-1]) - else: - new_text.append(line) - else: - new_text.append(line) - - return new_text #+ "\n" - - -""" -INLINE PATTERNS -============================================================================= - -Inline patterns such as *emphasis* are handled by means of auxiliary -objects, one per pattern. Pattern objects must be instances of classes -that extend markdown.Pattern. Each pattern object uses a single regular -expression and needs support the following methods: - - pattern.getCompiledRegExp() # returns a regular expression - - pattern.handleMatch(m) # takes a match object and returns - # an ElementTree element or just plain text - -All of python markdown's built-in patterns subclass from Pattern, -but you can add additional patterns that don't. - -Also note that all the regular expressions used by inline must -capture the whole block. For this reason, they all start with -'^(.*)' and end with '(.*)!'. In case with built-in expression -Pattern takes care of adding the "^(.*)" and "(.*)!". - -Finally, the order in which regular expressions are applied is very -important - e.g. if we first replace http://.../ links with tags -and _then_ try to replace inline html, we would end up with a mess. -So, we apply the expressions in the following order: - -* escape and backticks have to go before everything else, so - that we can preempt any markdown patterns by escaping them. - -* then we handle auto-links (must be done before inline html) - -* then we handle inline HTML. At this point we will simply - replace all inline HTML strings with a placeholder and add - the actual HTML to a hash. - -* then inline images (must be done before links) - -* then bracketed links, first regular then reference-style - -* finally we apply strong and emphasis -""" - - -""" -The actual regular expressions for patterns ------------------------------------------------------------------------------ -""" - -NOBRACKET = r'[^\]\[]*' -BRK = ( r'\[(' - + (NOBRACKET + r'(\[')*6 - + (NOBRACKET+ r'\])*')*6 - + NOBRACKET + r')\]' ) -NOIMG = r'(?|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*)\12)?\)''' -# [text](url) or [text]() - -IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)' -# ![alttxt](http://x.com/) or ![alttxt]() -REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3] -IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2] -NOT_STRONG_RE = r'( \* )' # stand-alone * or _ -AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>' # -AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # - -HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...> -ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # & -LINE_BREAK_RE = r' \n' # two spaces at end of line -LINE_BREAK_2_RE = r' $' # two spaces at end of text - - -""" -The pattern classes ------------------------------------------------------------------------------ -""" - -class Pattern: - """Base class that inline patterns subclass. """ - - def __init__ (self, pattern, markdown_instance=None): - """ - Create an instant of an inline pattern. - - Keyword arguments: - - * pattern: A regular expression that matches a pattern - - """ - self.pattern = pattern - self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL) - - # Api for Markdown to pass safe_mode into instance - self.safe_mode = False - if markdown_instance: - self.markdown = markdown_instance - - def getCompiledRegExp (self): - """ Return a compiled regular expression. """ - return self.compiled_re - - def handleMatch(self, m): - """Return a ElementTree element from the given match. - - Subclasses should override this method. - - Keyword arguments: - - * m: A re match object containing a match of the pattern. - - """ - pass - - def type(self): - """ Return class name, to define pattern type """ - return self.__class__.__name__ - -BasePattern = Pattern # for backward compatibility - -class SimpleTextPattern (Pattern): - """ Return a simple text of group(2) of a Pattern. """ - def handleMatch(self, m): - text = m.group(2) - if text == INLINE_PLACEHOLDER_PREFIX: - return None - return text - -class SimpleTagPattern (Pattern): - """ - Return element of type `tag` with a text attribute of group(3) - of a Pattern. - - """ - def __init__ (self, pattern, tag): - Pattern.__init__(self, pattern) - self.tag = tag - - def handleMatch(self, m): - el = etree.Element(self.tag) - el.text = m.group(3) - return el - - -class SubstituteTagPattern (SimpleTagPattern): - """ Return a eLement of type `tag` with no children. """ - def handleMatch (self, m): - return etree.Element(self.tag) - - -class BacktickPattern (Pattern): - """ Return a `` element containing the matching text. """ - def __init__ (self, pattern): - Pattern.__init__(self, pattern) - self.tag = "code" - - def handleMatch(self, m): - el = etree.Element(self.tag) - el.text = AtomicString(m.group(3).strip()) - return el - - -class DoubleTagPattern (SimpleTagPattern): - """Return a ElementTree element nested in tag2 nested in tag1. - - Useful for strong emphasis etc. - - """ - def handleMatch(self, m): - tag1, tag2 = self.tag.split(",") - el1 = etree.Element(tag1) - el2 = etree.SubElement(el1, tag2) - el2.text = m.group(3) - return el1 - - -class HtmlPattern (Pattern): - """ Store raw inline html and return a placeholder. """ - def handleMatch (self, m): - rawhtml = m.group(2) - inline = True - place_holder = self.markdown.htmlStash.store(rawhtml) - return place_holder - - -class LinkPattern (Pattern): - """ Return a link element from the given match. """ - def handleMatch(self, m): - el = etree.Element("a") - el.text = m.group(2) - title = m.group(11) - href = m.group(9) - - if href: - if href[0] == "<": - href = href[1:-1] - el.set("href", self.sanitize_url(href.strip())) - else: - el.set("href", "") - - if title: - title = dequote(title) #.replace('"', """) - el.set("title", title) - return el - - def sanitize_url(self, url): - """ - Sanitize a url against xss attacks in "safe_mode". - - Rather than specifically blacklisting `javascript:alert("XSS")` and all - its aliases (see ), we whitelist known - safe url formats. Most urls contain a network location, however some - are known not to (i.e.: mailto links). Script urls do not contain a - location. Additionally, for `javascript:...`, the scheme would be - "javascript" but some aliases will appear to `urlparse()` to have no - scheme. On top of that relative links (i.e.: "foo/bar.html") have no - scheme. Therefore we must check "path", "parameters", "query" and - "fragment" for any literal colons. We don't check "scheme" for colons - because it *should* never have any and "netloc" must allow the form: - `username:password@host:port`. - - """ - locless_schemes = ['', 'mailto', 'news'] - scheme, netloc, path, params, query, fragment = url = urlparse(url) - safe_url = False - if netloc != '' or scheme in locless_schemes: - safe_url = True - - for part in url[2:]: - if ":" in part: - safe_url = False - - if self.markdown.safeMode and not safe_url: - return '' - else: - return urlunparse(url) - -class ImagePattern(LinkPattern): - """ Return a img element from the given match. """ - def handleMatch(self, m): - el = etree.Element("img") - src_parts = m.group(9).split() - if src_parts: - src = src_parts[0] - if src[0] == "<" and src[-1] == ">": - src = src[1:-1] - el.set('src', self.sanitize_url(src)) - else: - el.set('src', "") - if len(src_parts) > 1: - el.set('title', dequote(" ".join(src_parts[1:]))) - - if ENABLE_ATTRIBUTES: - truealt = handleAttributes(m.group(2), el) - else: - truealt = m.group(2) - - el.set('alt', truealt) - return el - -class ReferencePattern(LinkPattern): - """ Match to a stored reference and return link element. """ - def handleMatch(self, m): - if m.group(9): - id = m.group(9).lower() - else: - # if we got something like "[Google][]" - # we'll use "google" as the id - id = m.group(2).lower() - - if not self.markdown.references.has_key(id): # ignore undefined refs - return None - href, title = self.markdown.references[id] - - text = m.group(2) - return self.makeTag(href, title, text) - - def makeTag(self, href, title, text): - el = etree.Element('a') - - el.set('href', self.sanitize_url(href)) - if title: - el.set('title', title) - - el.text = text - return el - - -class ImageReferencePattern (ReferencePattern): - """ Match to a stored reference and return img element. """ - def makeTag(self, href, title, text): - el = etree.Element("img") - el.set("src", self.sanitize_url(href)) - if title: - el.set("title", title) - el.set("alt", text) - return el - - -class AutolinkPattern (Pattern): - """ Return a link Element given an autolink (``). """ - def handleMatch(self, m): - el = etree.Element("a") - el.set('href', m.group(2)) - el.text = AtomicString(m.group(2)) - return el - -class AutomailPattern (Pattern): - """ - Return a mailto link Element given an automail link (``). - """ - def handleMatch(self, m): - el = etree.Element('a') - email = m.group(2) - if email.startswith("mailto:"): - email = email[len("mailto:"):] - - def codepoint2name(code): - """Return entity definition by code, or the code if not defined.""" - entity = htmlentitydefs.codepoint2name.get(code) - if entity: - return "%s%s;" % (AMP_SUBSTITUTE, entity) - else: - return "%s#%d;" % (AMP_SUBSTITUTE, code) - - letters = [codepoint2name(ord(letter)) for letter in email] - el.text = AtomicString(''.join(letters)) - - mailto = "mailto:" + email - mailto = "".join([AMP_SUBSTITUTE + '#%d;' % - ord(letter) for letter in mailto]) - el.set('href', mailto) - return el - - -""" -POST-PROCESSORS -============================================================================= - -Markdown also allows post-processors, which are similar to preprocessors in -that they need to implement a "run" method. However, they are run after core -processing. - -There are two types of post-processors: Treeprocessor and Postprocessor -""" - -class Treeprocessor(Processor): - """ - Treeprocessors are run on the ElementTree object before serialization. - - Each Treeprocessor implements a "run" method that takes a pointer to an - ElementTree, modifies it as necessary and returns an ElementTree - object. - - Treeprocessors must extend markdown.Treeprocessor. - - """ - def run(self, root): - """ - Subclasses of Treeprocessor should implement a `run` method, which - takes a root ElementTree. This method can return another ElementTree - object, and the existing root ElementTree will be replaced, or it can - modify the current tree and return None. - """ - pass - - -class InlineProcessor(Treeprocessor): - """ - A Treeprocessor that traverses a tree, applying inline patterns. - """ - - def __init__ (self, md): - self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX - self.__placeholder_suffix = ETX - self.__placeholder_length = 4 + len(self.__placeholder_prefix) \ - + len(self.__placeholder_suffix) - self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})') - self.markdown = md - - def __makePlaceholder(self, type): - """ Generate a placeholder """ - id = "%04d" % len(self.stashed_nodes) - hash = INLINE_PLACEHOLDER % id - return hash, id - - def __findPlaceholder(self, data, index): - """ - Extract id from data string, start from index - - Keyword arguments: - - * data: string - * index: index, from which we start search - - Returns: placeholder id and string index, after the found placeholder. - """ - - m = self.__placeholder_re.search(data, index) - if m: - return m.group(1), m.end() - else: - return None, index + 1 - - def __stashNode(self, node, type): - """ Add node to stash """ - placeholder, id = self.__makePlaceholder(type) - self.stashed_nodes[id] = node - return placeholder - - def __handleInline(self, data, patternIndex=0): - """ - Process string with inline patterns and replace it - with placeholders - - Keyword arguments: - - * data: A line of Markdown text - * patternIndex: The index of the inlinePattern to start with - - Returns: String with placeholders. - - """ - if not isinstance(data, AtomicString): - startIndex = 0 - while patternIndex < len(self.markdown.inlinePatterns): - data, matched, startIndex = self.__applyPattern( - self.markdown.inlinePatterns.value_for_index(patternIndex), - data, patternIndex, startIndex) - if not matched: - patternIndex += 1 - return data - - def __processElementText(self, node, subnode, isText=True): - """ - Process placeholders in Element.text or Element.tail - of Elements popped from self.stashed_nodes. - - Keywords arguments: - - * node: parent node - * subnode: processing node - * isText: bool variable, True - it's text, False - it's tail - - Returns: None - - """ - if isText: - text = subnode.text - subnode.text = None - else: - text = subnode.tail - subnode.tail = None - - childResult = self.__processPlaceholders(text, subnode) - - if not isText and node is not subnode: - pos = node.getchildren().index(subnode) - node.remove(subnode) - else: - pos = 0 - - childResult.reverse() - for newChild in childResult: - node.insert(pos, newChild) - - def __processPlaceholders(self, data, parent): - """ - Process string with placeholders and generate ElementTree tree. - - Keyword arguments: - - * data: string with placeholders instead of ElementTree elements. - * parent: Element, which contains processing inline data - - Returns: list with ElementTree elements with applied inline patterns. - """ - def linkText(text): - if text: - if result: - if result[-1].tail: - result[-1].tail += text - else: - result[-1].tail = text - else: - if parent.text: - parent.text += text - else: - parent.text = text - - result = [] - strartIndex = 0 - while data: - index = data.find(self.__placeholder_prefix, strartIndex) - if index != -1: - id, phEndIndex = self.__findPlaceholder(data, index) - - if self.stashed_nodes.has_key(id): - node = self.stashed_nodes.get(id) - - if index > 0: - text = data[strartIndex:index] - linkText(text) - - if not isString(node): # it's Element - for child in [node] + node.getchildren(): - if child.tail: - if child.tail.strip(): - self.__processElementText(node, child, False) - if child.text: - if child.text.strip(): - self.__processElementText(child, child) - else: # it's just a string - linkText(node) - strartIndex = phEndIndex - continue - - strartIndex = phEndIndex - result.append(node) - - else: # wrong placeholder - end = index + len(prefix) - linkText(data[strartIndex:end]) - strartIndex = end - else: - text = data[strartIndex:] - linkText(text) - data = "" - - return result - - def __applyPattern(self, pattern, data, patternIndex, startIndex=0): - """ - Check if the line fits the pattern, create the necessary - elements, add it to stashed_nodes. - - Keyword arguments: - - * data: the text to be processed - * pattern: the pattern to be checked - * patternIndex: index of current pattern - * startIndex: string index, from which we starting search - - Returns: String with placeholders instead of ElementTree elements. - - """ - match = pattern.getCompiledRegExp().match(data[startIndex:]) - leftData = data[:startIndex] - - if not match: - return data, False, 0 - - node = pattern.handleMatch(match) - - if node is None: - return data, True, len(leftData) + match.span(len(match.groups()))[0] - - if not isString(node): - if not isinstance(node.text, AtomicString): - # We need to process current node too - for child in [node] + node.getchildren(): - if not isString(node): - if child.text: - child.text = self.__handleInline(child.text, - patternIndex + 1) - if child.tail: - child.tail = self.__handleInline(child.tail, - patternIndex) - - placeholder = self.__stashNode(node, pattern.type()) - - return "%s%s%s%s" % (leftData, - match.group(1), - placeholder, match.groups()[-1]), True, 0 - - def run(self, tree): - """Apply inline patterns to a parsed Markdown tree. - - Iterate over ElementTree, find elements with inline tag, apply inline - patterns and append newly created Elements to tree. If you don't - want process your data with inline paterns, instead of normal string, - use subclass AtomicString: - - node.text = AtomicString("data won't be processed with inline patterns") - - Arguments: - - * markdownTree: ElementTree object, representing Markdown tree. - - Returns: ElementTree object with applied inline patterns. - - """ - self.stashed_nodes = {} - - stack = [tree] - - while stack: - currElement = stack.pop() - insertQueue = [] - for child in currElement.getchildren(): - if child.text and not isinstance(child.text, AtomicString): - text = child.text - child.text = None - lst = self.__processPlaceholders(self.__handleInline( - text), child) - stack += lst - insertQueue.append((child, lst)) - - if child.getchildren(): - stack.append(child) - - for element, lst in insertQueue: - if element.text: - element.text = handleAttributes(element.text, element) - i = 0 - for newChild in lst: - # Processing attributes - if newChild.tail: - newChild.tail = handleAttributes(newChild.tail, - element) - if newChild.text: - newChild.text = handleAttributes(newChild.text, - newChild) - element.insert(i, newChild) - i += 1 - - return tree - - -class PrettifyTreeprocessor(Treeprocessor): - """Add linebreaks to the html document.""" - def _prettifyETree(self, elem): - """Recursively add linebreaks to ElementTree children.""" - i = "\n" - if isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']: - if (not elem.text or not elem.text.strip()) \ - and len(elem) and isBlockLevel(elem[0].tag): - elem.text = i - for e in elem: - if isBlockLevel(e.tag): - self._prettifyETree(e) - if not elem.tail or not elem.tail.strip(): - elem.tail = i - if not elem.tail or not elem.tail.strip(): - elem.tail = i - - def run(self, root): - """.Add linebreaks to ElementTree root object.""" - self._prettifyETree(root) - # Do
's seperately as they are often in the middle of - # inline content and missed by _prettifyETree. - brs = root.getiterator('br') - for br in brs: - if not br.tail or not br.tail.strip(): - br.tail = '\n' - else: - br.tail = '\n%s' % br.tail - - -class Postprocessor(Processor): - """ - Postprocessors are run after the ElementTree it converted back into text. - - Each Postprocessor implements a "run" method that takes a pointer to a - text string, modifies it as necessary and returns a text string. - - Postprocessors must extend markdown.Postprocessor. - - """ - - def run(self, text): - """ - Subclasses of Postprocessor should implement a `run` method, which - takes the html document as a single text string and returns a - (possibly modified) string. - - """ - pass - - - -class RawHtmlPostprocessor(Postprocessor): - """ Restore raw html to the document. """ - - def run(self, text): - """ Iterate over html stash and restore "safe" html. """ - for i in range(self.markdown.htmlStash.html_counter): - html, safe = self.markdown.htmlStash.rawHtmlBlocks[i] - if self.markdown.safeMode and not safe: - if str(self.markdown.safeMode).lower() == 'escape': - html = self.escape(html) - elif str(self.markdown.safeMode).lower() == 'remove': - html = '' - else: - html = HTML_REMOVED_TEXT - if safe or not self.markdown.safeMode: - text = text.replace("

%s

" % (HTML_PLACEHOLDER % i), - html + "\n") - text = text.replace(HTML_PLACEHOLDER % i, html) - return text - - def escape(self, html): - """ Basic html escaping """ - html = html.replace('&', '&') - html = html.replace('<', '<') - html = html.replace('>', '>') - return html.replace('"', '"') - - -class AndSubstitutePostprocessor(Postprocessor): - """ Restore valid entities """ - def __init__(self): - pass - - def run(self, text): - text = text.replace(AMP_SUBSTITUTE, "&") - return text - - -""" -MISC AUXILIARY CLASSES -============================================================================= -""" - -class AtomicString(unicode): - """A string which should not be further processed.""" - pass - - -class HtmlStash: - """ - This class is used for stashing HTML objects that we extract - in the beginning and replace with place-holders. - """ - - def __init__ (self): - """ Create a HtmlStash. """ - self.html_counter = 0 # for counting inline html segments - self.rawHtmlBlocks=[] - - def store(self, html, safe=False): - """ - Saves an HTML segment for later reinsertion. Returns a - placeholder string that needs to be inserted into the - document. - - Keyword arguments: - - * html: an html segment - * safe: label an html segment as safe for safemode - - Returns : a placeholder string - - """ - self.rawHtmlBlocks.append((html, safe)) - placeholder = HTML_PLACEHOLDER % self.html_counter - self.html_counter += 1 - return placeholder - - def reset(self): - self.html_counter = 0 - self.rawHtmlBlocks = [] - -class OrderedDict(dict): - """ - A dictionary that keeps its keys in the order in which they're inserted. - - Copied from Django's SortedDict with some modifications. - - """ - def __new__(cls, *args, **kwargs): - instance = super(OrderedDict, cls).__new__(cls, *args, **kwargs) - instance.keyOrder = [] - return instance - - def __init__(self, data=None): - if data is None: - data = {} - super(OrderedDict, self).__init__(data) - if isinstance(data, dict): - self.keyOrder = data.keys() - else: - self.keyOrder = [] - for key, value in data: - if key not in self.keyOrder: - self.keyOrder.append(key) - - def __deepcopy__(self, memo): - from copy import deepcopy - return self.__class__([(key, deepcopy(value, memo)) - for key, value in self.iteritems()]) - - def __setitem__(self, key, value): - super(OrderedDict, self).__setitem__(key, value) - if key not in self.keyOrder: - self.keyOrder.append(key) - - def __delitem__(self, key): - super(OrderedDict, self).__delitem__(key) - self.keyOrder.remove(key) - - def __iter__(self): - for k in self.keyOrder: - yield k - - def pop(self, k, *args): - result = super(OrderedDict, self).pop(k, *args) - try: - self.keyOrder.remove(k) - except ValueError: - # Key wasn't in the dictionary in the first place. No problem. - pass - return result - - def popitem(self): - result = super(OrderedDict, self).popitem() - self.keyOrder.remove(result[0]) - return result - - def items(self): - return zip(self.keyOrder, self.values()) - - def iteritems(self): - for key in self.keyOrder: - yield key, super(OrderedDict, self).__getitem__(key) - - def keys(self): - return self.keyOrder[:] - - def iterkeys(self): - return iter(self.keyOrder) - - def values(self): - return [super(OrderedDict, self).__getitem__(k) for k in self.keyOrder] - - def itervalues(self): - for key in self.keyOrder: - yield super(OrderedDict, self).__getitem__(key) - - def update(self, dict_): - for k, v in dict_.items(): - self.__setitem__(k, v) - - def setdefault(self, key, default): - if key not in self.keyOrder: - self.keyOrder.append(key) - return super(OrderedDict, self).setdefault(key, default) - - def value_for_index(self, index): - """Return the value of the item at the given zero-based index.""" - return self[self.keyOrder[index]] - - def insert(self, index, key, value): - """Insert the key, value pair before the item with the given index.""" - if key in self.keyOrder: - n = self.keyOrder.index(key) - del self.keyOrder[n] - if n < index: - index -= 1 - self.keyOrder.insert(index, key) - super(OrderedDict, self).__setitem__(key, value) - - def copy(self): - """Return a copy of this object.""" - # This way of initializing the copy means it works for subclasses, too. - obj = self.__class__(self) - obj.keyOrder = self.keyOrder[:] - return obj - - def __repr__(self): - """ - Replace the normal dict.__repr__ with a version that returns the keys - in their sorted order. - """ - return '{%s}' % ', '.join(['%r: %r' % (k, v) for k, v in self.items()]) - - def clear(self): - super(OrderedDict, self).clear() - self.keyOrder = [] - - def index(self, key): - """ Return the index of a given key. """ - return self.keyOrder.index(key) - - def index_for_location(self, location): - """ Return index or None for a given location. """ - if location == '_begin': - i = 0 - elif location == '_end': - i = None - elif location.startswith('<') or location.startswith('>'): - i = self.index(location[1:]) - if location.startswith('>'): - if i >= len(self): - # last item - i = None - else: - i += 1 - else: - raise ValueError('Not a valid location: "%s". Location key ' - 'must start with a ">" or "<".' % location) - return i - - def add(self, key, value, location): - """ Insert by key location. """ - i = self.index_for_location(location) - if i is not None: - self.insert(i, key, value) - else: - self.__setitem__(key, value) - - def link(self, key, location): - """ Change location of an existing item. """ - n = self.keyOrder.index(key) - del self.keyOrder[n] - i = self.index_for_location(location) - try: - if i is not None: - self.keyOrder.insert(i, key) - else: - self.keyOrder.append(key) - except Error: - # restore to prevent data loss and reraise - self.keyOrder.insert(n, key) - raise Error - - -""" -Markdown -============================================================================= -""" - -class Markdown: - """Convert Markdown to HTML.""" - - def __init__(self, - extensions=[], - extension_configs={}, - safe_mode = False): - """ - Creates a new Markdown instance. - - Keyword arguments: - - * extensions: A list of extensions. - If they are of type string, the module mdx_name.py will be loaded. - If they are a subclass of markdown.Extension, they will be used - as-is. - * extension-configs: Configuration setting for extensions. - * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". - - """ - self.parser = BlockParser() - self.safeMode = safe_mode - self.registeredExtensions = [] - self.docType = "" - self.stripTopLevelTags = True - - self.preprocessors = OrderedDict() - self.preprocessors["html_block"] = HtmlBlockPreprocessor(self) - self.preprocessors["reference"] = ReferencePreprocessor(self) - # footnote preprocessor will be inserted with "amp_substitute" - - self.prePatterns = [] - - self.inlinePatterns = OrderedDict() - self.inlinePatterns["backtick"] = BacktickPattern(BACKTICK_RE) - self.inlinePatterns["escape"] = SimpleTextPattern(ESCAPE_RE) - self.inlinePatterns["reference"] = ReferencePattern(REFERENCE_RE, self) - self.inlinePatterns["link"] = LinkPattern(LINK_RE, self) - self.inlinePatterns["image_link"] = ImagePattern(IMAGE_LINK_RE, self) - self.inlinePatterns["image_reference"] = \ - ImageReferencePattern(IMAGE_REFERENCE_RE, self) - self.inlinePatterns["autolink"] = AutolinkPattern(AUTOLINK_RE, self) - self.inlinePatterns["automail"] = AutomailPattern(AUTOMAIL_RE, self) - self.inlinePatterns["linebreak2"] = \ - SubstituteTagPattern(LINE_BREAK_2_RE, 'br') - self.inlinePatterns["linebreak"] = \ - SubstituteTagPattern(LINE_BREAK_RE, 'br') - self.inlinePatterns["html"] = HtmlPattern(HTML_RE, self) - self.inlinePatterns["entity"] = HtmlPattern(ENTITY_RE, self) - self.inlinePatterns["not_strong"] = SimpleTextPattern(NOT_STRONG_RE) - self.inlinePatterns["strong_em"] = \ - DoubleTagPattern(STRONG_EM_RE, 'strong,em') - self.inlinePatterns["strong"] = SimpleTagPattern(STRONG_RE, 'strong') - self.inlinePatterns["emphasis"] = SimpleTagPattern(EMPHASIS_RE, 'em') - self.inlinePatterns["emphasis2"] = \ - SimpleTagPattern(EMPHASIS_2_RE, 'em') - # The order of the handlers matters!!! - - self.references = {} - self.htmlStash = HtmlStash() - self.registerExtensions(extensions = extensions, - configs = extension_configs) - self.reset() - - def registerExtensions(self, extensions, configs): - """ - Register extensions with this instance of Markdown. - - Keyword aurguments: - - * extensions: A list of extensions, which can either - be strings or objects. See the docstring on Markdown. - * configs: A dictionary mapping module names to config options. - - """ - for ext in extensions: - if isinstance(ext, basestring): - ext = load_extension(ext, configs.get(ext, [])) - elif hasattr(ext, 'extendMarkdown'): - # Looks like an Extension. - # Nothing to do here. - pass - else: - message(ERROR, "Incorrect type! Extension '%s' is " - "neither a string or an Extension." %(repr(ext))) - continue - ext.extendMarkdown(self, globals()) - - def registerExtension(self, extension): - """ This gets called by the extension """ - self.registeredExtensions.append(extension) - - def reset(self): - """ - Resets all state variables so that we can start with a new text. - """ - self.htmlStash.reset() - self.references.clear() - - for extension in self.registeredExtensions: - extension.reset() - - def convert (self, source): - """Convert markdown to serialized XHTML.""" - - # Fixup the source text - if not source: - return u"" # a blank unicode string - try: - source = unicode(source) - except UnicodeDecodeError: - message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.') - return u"" - - source = source.replace(STX, "").replace(ETX, "") - source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n" - source = re.sub(r'\n\s+\n', '\n\n', source) - source = source.expandtabs(TAB_LENGTH) - - # Split into lines and run the line preprocessors. - self.lines = source.split("\n") - for prep in self.preprocessors.values(): - self.lines = prep.run(self.lines) - - # Parse the high-level elements. - root = self.parser.parseDocument(self.lines).getroot() - - # Run the tree-processors - for treeprocessor in self.treeprocessors.values(): - newRoot = treeprocessor.run(root) - if newRoot: - root = newRoot - - # Serialize _properly_. Strip top-level tags. - xml, length = codecs.utf_8_decode(etree.tostring(root, encoding="utf8")) - if self.stripTopLevelTags: - xml = xml.strip()[44:-7] + "\n" - - # Run the text post-processors - for pp in self.postprocessors.values(): - xml = pp.run(xml) - - return xml.strip() - - def convertFile(self, input = None, output = None, encoding = None): - """Converts a markdown file and returns the HTML as a unicode string. - - Decodes the file using the provided encoding (defaults to utf-8), - passes the file content to markdown, and outputs the html to either - the provided stream or the file with provided name, using the same - encoding as the source file. - - **Note:** This is the only place that decoding and encoding of unicode - takes place in Python-Markdown. (All other code is unicode-in / - unicode-out.) - - Keyword arguments: - - * input: Name of source text file. - * output: Name of output file. Writes to stdout if `None`. - * extensions: A list of extension names (may contain config args). - * encoding: Encoding of input and output files. Defaults to utf-8. - * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". - - """ - - encoding = encoding or "utf-8" - - # Read the source - input_file = codecs.open(input, mode="r", encoding=encoding) - text = input_file.read() - input_file.close() - text = text.lstrip(u'\ufeff') # remove the byte-order mark - - # Convert - html = self.convert(text) - - # Write to file or stdout - if type(output) == type("string"): - output_file = codecs.open(output, "w", encoding=encoding) - output_file.write(html) - output_file.close() - else: - output.write(html.encode(encoding)) - - -""" -Extensions ------------------------------------------------------------------------------ -""" - -class Extension: - """ Base class for extensions to subclass. """ - def __init__(self, configs = {}): - """Create an instance of an Extention. - - Keyword arguments: - - * configs: A dict of configuration setting used by an Extension. - """ - self.config = configs - - def getConfig(self, key): - """ Return a setting for the given key or an empty string. """ - if self.config.has_key(key): - return self.config[key][0] - else: - return "" - - def getConfigInfo(self): - """ Return all config settings as a list of tuples. """ - return [(key, self.config[key][1]) for key in self.config.keys()] - - def setConfig(self, key, value): - """ Set a config setting for `key` with the given `value`. """ - self.config[key][0] = value - - def extendMarkdown(self, md, md_globals): - """ - Add the various proccesors and patterns to the Markdown Instance. - - This method must be overriden by every extension. - - Keyword arguments: - - * md: The Markdown instance. - - * md_globals: Global variables in the markdown module namespace. - - """ - pass - -def load_extension(ext_name, configs = []): - """Load extension by name, then return the module. - - The extension name may contain arguments as part of the string in the - following format: "extname(key1=value1,key2=value2)" - - """ - - # Parse extensions config params (ignore the order) - configs = dict(configs) - pos = ext_name.find("(") # find the first "(" - if pos > 0: - ext_args = ext_name[pos+1:-1] - ext_name = ext_name[:pos] - pairs = [x.split("=") for x in ext_args.split(",")] - configs.update([(x.strip(), y.strip()) for (x, y) in pairs]) - - # Setup the module names - ext_module = 'markdown_extensions' - module_name_new_style = '.'.join([ext_module, ext_name]) - module_name_old_style = '_'.join(['mdx', ext_name]) - - # Try loading the extention first from one place, then another - try: # New style (markdown_extensons.) - module = __import__(module_name_new_style, {}, {}, [ext_module]) - except ImportError: - try: # Old style (mdx.) - module = __import__(module_name_old_style) - except ImportError: - message(CRITICAL, "Failed loading extension '%s' from '%s' or '%s'" - % (ext_name, module_name_new_style, module_name_old_style)) - - # If the module is loaded successfully, we expect it to define a - # function called makeExtension() - try: - return module.makeExtension(configs.items()) - except: - message(CRITICAL, "Failed to instantiate extension '%s'" % ext_name) - -def load_extensions(ext_names): - """Loads multiple extensions""" - extensions = [] - for ext_name in ext_names: - extension = load_extension(ext_name) - if extension: - extensions.append(extension) - return extensions - -# Extensions should use "markdown.etree" instead of "etree" (or do `from -# markdown import etree`). Do not import it by yourself. - -etree = importETree() - -""" -EXPORTED FUNCTIONS -============================================================================= - -Those are the two functions we really mean to export: markdown() and -markdownFromFile(). -""" - -def markdown(text, - extensions = [], - safe_mode = False): - """Convert a markdown string to HTML and return HTML as a unicode string. - - This is a shortcut function for `Markdown` class to cover the most - basic use case. It initializes an instance of Markdown, loads the - necessary extensions and runs the parser on the given text. - - Keyword arguments: - - * text: Markdown formatted text as Unicode or ASCII string. - * extensions: A list of extensions or extension names (may contain config args). - * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". - - Returns: An HTML document as a string. - - """ - md = Markdown(extensions=load_extensions(extensions), - safe_mode = safe_mode) - return md.convert(text) - - -def markdownFromFile(input = None, - output = None, - extensions = [], - encoding = None, - safe = False): - """Read markdown code from a file and write it to a file or a stream.""" - md = Markdown(extensions=load_extensions(extensions), safe_mode = safe) - md.convertFile(input, output, encoding) - - -""" -COMMAND-LINE SPECIFIC STUFF -============================================================================= - -The rest of the code is specifically for handling the case where Python -Markdown is called from the command line. -""" - -OPTPARSE_WARNING = """ -Python 2.3 or higher required for advanced command line options. -For lower versions of Python use: - - %s INPUT_FILE > OUTPUT_FILE - -""" % EXECUTABLE_NAME_FOR_USAGE - -def parse_options(): - """ - Define and parse `optparse` options for command-line usage. - """ - - try: - optparse = __import__("optparse") - except: - if len(sys.argv) == 2: - return {'input': sys.argv[1], - 'output': None, - 'safe': False, - 'extensions': [], - 'encoding': None }, CRITICAL - else: - print OPTPARSE_WARNING - return None, None - - parser = optparse.OptionParser(usage="%prog INPUTFILE [options]") - parser.add_option("-f", "--file", dest="filename", default=sys.stdout, - help="write output to OUTPUT_FILE", - metavar="OUTPUT_FILE") - parser.add_option("-e", "--encoding", dest="encoding", - help="encoding for input and output files",) - parser.add_option("-q", "--quiet", default = CRITICAL, - action="store_const", const=CRITICAL+10, dest="verbose", - help="suppress all messages") - parser.add_option("-v", "--verbose", - action="store_const", const=INFO, dest="verbose", - help="print info messages") - parser.add_option("-s", "--safe", dest="safe", default=False, - metavar="SAFE_MODE", - help="safe mode ('replace', 'remove' or 'escape' user's HTML tag)") - parser.add_option("--noisy", - action="store_const", const=DEBUG, dest="verbose", - help="print debug messages") - parser.add_option("-x", "--extension", action="append", dest="extensions", - help = "load extension EXTENSION", metavar="EXTENSION") - - (options, args) = parser.parse_args() - - if not len(args) == 1: - parser.print_help() - return None, None - else: - input_file = args[0] - - if not options.extensions: - options.extensions = [] - - return {'input': input_file, - 'output': options.filename, - 'safe': options.safe, - 'extensions': options.extensions, - 'encoding': options.encoding }, options.verbose - -def command_line_run(): - """Run Markdown from the command line.""" - - # Setup a logger manually for compatibility with Python 2.3 - logger = logging.getLogger('MARKDOWN') - logger.setLevel(COMMAND_LINE_LOGGING_LEVEL) - logger.addHandler(logging.StreamHandler()) - - # Parse options and adjust logging level if necessary - options, logging_level = parse_options() - if not options: sys.exit(0) - if logging_level: logging.getLogger('MARKDOWN').setLevel(logging_level) - - # Run - markdownFromFile(**options) +from markdown import commandline if __name__ == '__main__': - command_line_run() + commandline.run() -- cgit v1.2.3