From 8e7e2a254859f61ec2632a11725d99c5ea0c9f09 Mon Sep 17 00:00:00 2001 From: Yuri Takhteyev Date: Sun, 12 Oct 2008 18:02:07 -0700 Subject: More cleanup. Refactored all the core parsing logic into a separate class: MarkdownParser. --- markdown.py | 2648 ++++++++++++++++++++++++++++++----------------------------- 1 file changed, 1335 insertions(+), 1313 deletions(-) (limited to 'markdown.py') diff --git a/markdown.py b/markdown.py index 562380a..0aa530d 100755 --- a/markdown.py +++ b/markdown.py @@ -97,6 +97,24 @@ INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:" INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX AMP_SUBSTITUTE = STX+"amp"+ETX +def wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL) +CORE_RE = { + 'header': wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title + 'reference-def': wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'), + # [Google]: http://www.google.com/ + 'containsline': wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc. + 'ol': wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text + 'ul': wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text" + 'isline1': wrapRe(r'(\**)'), # *** + 'isline2': wrapRe(r'(\-*)'), # --- + 'isline3': wrapRe(r'(\_*)'), # ___ + 'tabbed': wrapRe(r'((\t)|( ))(.*)'), # an indented line + 'quoted': wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...") + 'containsline': re.compile(r'^([-]*)$|^([=]*)$', re.M), + 'attr': re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} +} +"""Basic and reusable regular expressions.""" + """ AUXILIARY GLOBAL FUNCTIONS @@ -160,1385 +178,1419 @@ def dequote(string): """ -PRE-PROCESSORS +OVERALL DESIGN ============================================================================= -Preprocessors work on source text before we start doing anything too -complicated. There are two types of preprocessors: TextPreprocessor and -Preprocessor. -""" +Markdown processing takes place in three steps: -class TextPreprocessor: - """ - TextPreprocessors are run before the text is broken into lines. 
- - Each TextPreprocessor implements a "run" method that takes a pointer to a - text string of the document, modifies it as necessary and returns - either the same pointer or a pointer to a new string. - - TextPreprocessors must extend markdown.TextPreprocessor. +1. A bunch of "preprocessors" munge the input text. +2. MarkdownParser() parses the high-level structural elements of the + pre-processed text into an ElementTree. +3. A bunch of Patterns are run against the ElementTree, detecting inline + markup. +4. Some extra use-defined post-processors are run. +5. The output is written to a string. - """ +Those steps are put together by the Markdown() class. - def run(self, text): - """ - Each subclass of TextPreprocessor should override the `run` method, - which takes the document text as a single string and returns the - (possibly modified) document as a single string. - - """ - pass +The code below is organized as follows: +1. MarkdownParser class - does basic parsing. +2. All the post-processors, patterns, etc. +3. Markdown class - does the high-level wrapping. +""" -class Preprocessor: - """ - Preprocessors are run after the text is broken into lines. - Each preprocessor implements a "run" method that takes a pointer to a - list of lines of the document, modifies it as necessary and returns - either the same pointer or a pointer to a new list. - - Preprocessors must extend markdown.Preprocessor. - - """ +""" +CORE MARKDOWN PARSER +============================================================================= - def run(self, lines): - """ - Each subclass of Preprocessor should override the `run` method, which - takes the document as a list of strings split by newlines and returns - the (possibly modified) list of lines. +This class handles basic Markdown parsing. It doesn't concern itself with +inline elements such as **bold** or *italics*, but rather just catches blocks, +lists, quotes, etc. 
+""" - """ +class MarkdownParser: + """Parser Markdown into a ElementTree.""" + + def __init__(self): pass - -class HtmlBlockPreprocessor(TextPreprocessor): - """Remove html blocks from the text and store them for later retrieval.""" + def parseDocument(self, lines): + """Parse a markdown string into an ElementTree.""" + # Create a ElementTree from the lines + root = etree.Element("div") + buffer = [] + for line in lines: + if line.startswith("#"): + self.parseChunk(root, buffer) + buffer = [line] + else: + buffer.append(line) - right_tag_patterns = ["", "%s>"] + self.parseChunk(root, buffer) - def _get_left_tag(self, block): - return block[1:].replace(">", " ", 1).split()[0].lower() + return etree.ElementTree(root) - def _get_right_tag(self, left_tag, block): - for p in self.right_tag_patterns: - tag = p % left_tag - i = block.rfind(tag) - if i > 2: - return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) - return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) - - def _equal_tags(self, left_tag, right_tag): - if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. - return True - if ("/" + left_tag) == right_tag: - return True - if (right_tag == "--" and left_tag == "--"): - return True - elif left_tag == right_tag[1:] \ - and right_tag[0] != "<": - return True - else: - return False - def _is_oneliner(self, tag): - return (tag in ['hr', 'hr/']) + def parseChunk(self, parent_elem, lines, inList=0, looseList=0): + """Process a chunk of markdown-formatted text and attach the parse to + an ElementTree node. - def run(self, text): - new_blocks = [] - text = text.split("\n\n") - items = [] - left_tag = '' - right_tag = '' - in_tag = False # flag + Process a section of a source document, looking for high + level structural elements like lists, block quotes, code + segments, html blocks, etc. Some those then get stripped + of their high level markup (e.g. get unindented) and the + lower-level markup is processed recursively. 
- while text: - block = text[0] - if block.startswith("\n"): - block = block[1:] - text = text[1:] + Keyword arguments: + + * parent_elem: A ElementTree element to which the content will be added. + * lines: a list of lines + * inList: a level + + Returns: None + + """ + # Loop through lines until none left. + while lines: - if block.startswith("\n"): - block = block[1:] - - if not in_tag: - if block.startswith("<"): - left_tag = self._get_left_tag(block) - right_tag, data_index = self._get_right_tag(left_tag, block) - - if data_index < len(block): - text.insert(0, block[data_index:]) - block = block[:data_index] + # Skipping empty line + if not lines[0]: + lines = lines[1:] + continue + + # Check if this section starts with a list, a blockquote or + # a code block - if not (isBlockLevel(left_tag) \ - or block[1] in ["!", "?", "@", "%"]): - new_blocks.append(block) - continue + processFn = { 'ul': self._processUList, + 'ol': self._processOList, + 'quoted': self._processQuote, + 'tabbed': self._processCodeBlock} - if self._is_oneliner(left_tag): - new_blocks.append(block.strip()) - continue - - if block[1] == "!": - # is a comment block - left_tag = "--" - right_tag, data_index = self._get_right_tag(left_tag, block) - # keep checking conditions below and maybe just append - - if block.rstrip().endswith(">") \ - and self._equal_tags(left_tag, right_tag): - new_blocks.append( - self.stash.store(block.strip())) - continue - else: #if not block[1] == "!": - # if is block level tag and is not complete - - if isBlockLevel(left_tag) or left_tag == "--" \ - and not block.rstrip().endswith(">"): - items.append(block.strip()) - in_tag = True - else: - new_blocks.append( - self.stash.store(block.strip())) - - continue + for regexp in ['ul', 'ol', 'quoted', 'tabbed']: + m = CORE_RE[regexp].match(lines[0]) + if m: + processFn[regexp](parent_elem, lines, inList) + return - new_blocks.append(block) + # We are NOT looking at one of the high-level structures like + # lists or 
blockquotes. So, it's just a regular paragraph + # (though perhaps nested inside a list or something else). If + # we are NOT inside a list, we just need to look for a blank + # line to find the end of the block. If we ARE inside a + # list, however, we need to consider that a sublist does not + # need to be separated by a blank line. Rather, the following + # markup is legal: + # + # * The top level list item + # + # Another paragraph of the list. This is where we are now. + # * Underneath we might have a sublist. + # - else: - items.append(block.strip()) - - right_tag, data_index = self._get_right_tag(left_tag, block) - - if self._equal_tags(left_tag, right_tag): - # if find closing tag - in_tag = False - new_blocks.append( - self.stash.store('\n\n'.join(items))) - items = [] + if inList: - if items: - new_blocks.append(self.stash.store('\n\n'.join(items))) - new_blocks.append('\n') - - return "\n\n".join(new_blocks) + start, lines = self._linesUntil(lines, (lambda line: + CORE_RE['ul'].match(line) + or CORE_RE['ol'].match(line) + or not line.strip())) -HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor() + self.parseChunk(parent_elem, start, inList-1, looseList=looseList) + inList = inList-1 + else: # Ok, so it's just a simple block -class HeaderPreprocessor(Preprocessor): + paragraph, lines = self._linesUntil(lines, lambda line: + not line.strip() or line[0] == '>') - """Replace underlined headers with hashed headers. + if len(paragraph) and paragraph[0].startswith('#'): + self._processHeader(parent_elem, paragraph) + + elif len(paragraph) and \ + CORE_RE["isline3"].match(paragraph[0]): - (To avoid the need for lookahead later.) 
+ self._processHR(parent_elem) + lines = paragraph[1:] + lines + + elif paragraph: + self._processParagraph(parent_elem, paragraph, + inList, looseList) - """ + if lines and not lines[0].strip(): + lines = lines[1:] # skip the first (blank) line - def run (self, lines): - i = -1 - while i+1 < len(lines): - i = i+1 - if not lines[i].strip(): - continue + def _processHR(self, parentElem): + hr = etree.SubElement(parentElem, "hr") + + def _processHeader(self, parentElem, paragraph): + m = CORE_RE['header'].match(paragraph[0]) + if m: + level = len(m.group(1)) + h = etree.SubElement(parentElem, "h%d" % level) + h.text = m.group(2).strip() + else: + message(CRITICAL, "We've got a problem header!") - if lines[i].startswith("#"): - lines.insert(i+1, "\n") - if (i+1 <= len(lines) - and lines[i+1] - and lines[i+1][0] in ['-', '=']): + def _processParagraph(self, parentElem, paragraph, inList, looseList): - underline = lines[i+1].strip() + if ( parentElem.tag == 'li' + and not (looseList or parentElem.getchildren())): - if underline == "="*len(underline): - lines[i] = "# " + lines[i].strip() - lines[i+1] = "" - elif underline == "-"*len(underline): - lines[i] = "## " + lines[i].strip() - lines[i+1] = "" - - return lines + # If this is the first paragraph inside "li", don't + # put

around it - append the paragraph bits directly + # onto parentElem + el = parentElem + else: + # Otherwise make a "p" element + el = etree.SubElement(parentElem, "p") -HEADER_PREPROCESSOR = HeaderPreprocessor() + dump = [] + + # Searching for hr or header + for line in paragraph: + # it's hr + if CORE_RE["isline3"].match(line): + el.text = "\n".join(dump) + self._processHR(el) + dump = [] + # it's header + elif line.startswith("#"): + el.text = "\n".join(dump) + self._processHeader(parentElem, [line]) + dump = [] + else: + dump.append(line) + if dump: + text = "\n".join(dump) + el.text = text + def _processUList(self, parentElem, lines, inList): + self._processList(parentElem, lines, inList, + listexpr='ul', tag = 'ul') -class LinePreprocessor(Preprocessor): - """Convert HR lines to "___" format.""" - blockquote_re = re.compile(r'^(> )+') + def _processOList(self, parentElem, lines, inList): + self._processList(parentElem, lines, inList, + listexpr='ol', tag = 'ol') - def run (self, lines): - for i in range(len(lines)): - prefix = '' - m = self.blockquote_re.search(lines[i]) - if m: - prefix = m.group(0) - if self._isLine(lines[i][len(prefix):]): - lines[i] = prefix + "___" - return lines - def _isLine(self, block): - """Determine if a block should be replaced with an


""" - if block.startswith(" "): - return False # a code block - text = "".join([x for x in block if not x.isspace()]) - if len(text) <= 2: - return False - for pattern in ['isline1', 'isline2', 'isline3']: - m = CORE_RE[pattern].match(text) - if (m and m.group(1)): - return True - else: - return False + def _processList(self, parentElem, lines, inList, listexpr, tag): + """ + Given a list of document lines starting with a list item, + finds the end of the list, breaks it up, and recursively + processes each list item and the remainder of the text file. -LINE_PREPROCESSOR = LinePreprocessor() + Keyword arguments: + + * parentElem: A ElementTree element to which the content will be added + * lines: a list of lines + * inList: a level + + Returns: None + + """ + ul = etree.SubElement(parentElem, tag) # ul might actually be '
    ' + looseList = 0 -class ReferencePreprocessor(Preprocessor): - """Remove reference definitions from the text and store them for later use.""" - def run (self, lines): - new_text = []; - for line in lines: - m = CORE_RE['reference-def'].match(line) - if m: - id = m.group(2).strip().lower() - t = m.group(4).strip() # potential title - if not t: - self.references[id] = (m.group(3), t) - elif (len(t) >= 2 - and (t[0] == t[-1] == "\"" - or t[0] == t[-1] == "\'" - or (t[0] == "(" and t[-1] == ")") ) ): - self.references[id] = (m.group(3), t[1:-1]) - else: - new_text.append(line) - else: - new_text.append(line) + # Make a list of list items + items = [] + item = -1 - return new_text #+ "\n" + i = 0 # a counter to keep track of where we are -REFERENCE_PREPROCESSOR = ReferencePreprocessor() + for line in lines: + loose = 0 + if not line.strip(): + # If we see a blank line, this _might_ be the end of the list + i += 1 + loose = 1 + # Find the next non-blank line + for j in range(i, len(lines)): + if lines[j].strip(): + next = lines[j] + break + else: + # There is no more text => end of the list + break + # Check if the next non-blank line is still a part of the list -""" -INLINE PATTERNS -============================================================================= + if ( CORE_RE[listexpr].match(next) or + CORE_RE['tabbed'].match(next) ): + # get rid of any white space in the line + items[item].append(line.strip()) + looseList = loose or looseList + continue + else: + break # found end of the list -Inline patterns such as *emphasis* are handled by means of auxiliary -objects, one per pattern. Pattern objects must be instances of classes -that extend markdown.Pattern. 
Each pattern object uses a single regular -expression and needs support the following methods: + # Now we need to detect list items (at the current level) + # while also detabing child elements if necessary - pattern.getCompiledRegExp() # returns a regular expression + for expr in ['ul', 'ol', 'tabbed']: - pattern.handleMatch(m) # takes a match object and returns - # an ElementTree element or just plain text + m = CORE_RE[expr].match(line) + if m: + if expr in ['ul', 'ol']: # We are looking at a new item + #if m.group(1) : + # Removed the check to allow for a blank line + # at the beginning of the list item + items.append([m.group(1)]) + item += 1 + elif expr == 'tabbed': # This line needs to be detabbed + items[item].append(m.group(4)) #after the 'tab' -All of python markdown's built-in patterns subclass from Pattern, -but you can add additional patterns that don't. + i += 1 + break + else: + items[item].append(line) # Just regular continuation + i += 1 # added on 2006.02.25 + else: + i += 1 -Also note that all the regular expressions used by inline must -capture the whole block. For this reason, they all start with -'^(.*)' and end with '(.*)!'. In case with built-in expression -Pattern takes care of adding the "^(.*)" and "(.*)!". + # Add the ElementTree elements + for item in items: + li = etree.SubElement(ul, "li") -Finally, the order in which regular expressions are applied is very -important - e.g. if we first replace http://.../ links with tags -and _then_ try to replace inline html, we would end up with a mess. -So, we apply the expressions in the following order: + self.parseChunk(li, item, inList + 1, looseList = looseList) -* escape and backticks have to go before everything else, so - that we can preempt any markdown patterns by escaping them. + # Process the remaining part of the section -* then we handle auto-links (must be done before inline html) + self.parseChunk(parentElem, lines[i:], inList) -* then we handle inline HTML. 
At this point we will simply - replace all inline HTML strings with a placeholder and add - the actual HTML to a hash. -* then inline images (must be done before links) + def _linesUntil(self, lines, condition): + """ + A utility function to break a list of lines upon the + first line that satisfied a condition. The condition + argument should be a predicate function. + + """ + i = -1 + for line in lines: + i += 1 + if condition(line): + break + else: + i += 1 + return lines[:i], lines[i:] -* then bracketed links, first regular then reference-style + def _processQuote(self, parentElem, lines, inList): + """ + Given a list of document lines starting with a quote finds + the end of the quote, unindents it and recursively + processes the body of the quote and the remainder of the + text file. -* finally we apply strong and emphasis -""" + Keyword arguments: + + * parentElem: ElementTree element to which the content will be added + * lines: a list of lines + * inList: a level + + Returns: None + + """ + dequoted = [] + i = 0 + blank_line = False # allow one blank line between paragraphs + for line in lines: + m = CORE_RE['quoted'].match(line) + if m: + dequoted.append(m.group(1)) + i += 1 + blank_line = False + elif not blank_line and line.strip() != '': + dequoted.append(line) + i += 1 + elif not blank_line and line.strip() == '': + dequoted.append(line) + i += 1 + blank_line = True + else: + break + blockquote = etree.SubElement(parentElem, "blockquote") -""" -The actual regular expressions for patterns ------------------------------------------------------------------------------ -""" + self.parseChunk(blockquote, dequoted, inList) + self.parseChunk(parentElem, lines[i:], inList) -NOBRACKET = r'[^\]\[]*' -BRK = ( r'\[(' - + (NOBRACKET + r'(\[')*6 - + (NOBRACKET+ r'\])*')*6 - + NOBRACKET + r')\]' ) -NOIMG = r'(?|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*)\12)?\)''' # [text](url) or [text]() + def _processCodeBlock(self, parentElem, lines, inList): + """ + Given a list 
of document lines starting with a code block + finds the end of the block, puts it into the ElementTree verbatim + wrapped in ("
    ") and recursively processes the
    +        remainder of the text file.
     
    -IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)' # ![alttxt](http://x.com/) or ![alttxt]()
    -REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]'           # [Google][3]
    -IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2]
    -NOT_STRONG_RE = r'( \* )'                        # stand-alone * or _
    -AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>'        # 
    -AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'               # 
    +        Keyword arguments:
    +        
    +        * parentElem: ElementTree element to which the content will be added
    +        * lines: a list of lines
    +        * inList: a level
    +        
    +        Returns: None
    +        
    +        """
    +        detabbed, theRest = self.detectTabbed(lines)
     
    -HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'               # <...>
    -ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &
    -LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
    -LINE_BREAK_2_RE = r'  $'                    # two spaces at end of text
    +        pre = etree.SubElement(parentElem, "pre")
    +        code = etree.SubElement(pre, "code")
    +        
    +        text = "\n".join(detabbed).rstrip()+"\n"
    +        code.text = AtomicString(text)
    +        self.parseChunk(parentElem, theRest, inList)        
     
    +    def detectTabbed(self, lines):
    +        """ Find indented text and remove indent before further processing.
     
    -"""
    -The pattern classes
    ------------------------------------------------------------------------------
    -"""
    +        Keyword arguments:
    +        
    +        * lines: an array of strings
    +        * fn: (stale: not a parameter of this method; detabbing is done
    +           by the local detab() helper using CORE_RE['tabbed'])
    +        
    +        Returns: a list of post-processed items and the unused
    +        remainder of the original list
    +        
    +        """
    +        items = []
    +        item = -1
    +        i = 0 # to keep track of where we are
     
    -class Pattern:
    -    """Base class that inline patterns subclass. """
    +        def detab(line):
    +            match = CORE_RE['tabbed'].match(line)
    +            if match:
    +               return match.group(4)
     
    -    def __init__ (self, pattern):
    -        """
    -        Create an instant of an inline pattern.
    +        for line in lines:
    +            if line.strip(): # Non-blank line
    +                line = detab(line)
    +                if line:
    +                    items.append(line)
    +                    i += 1
    +                    continue
    +                else:
    +                    return items, lines[i:]
     
    -        Keyword arguments:
    +            else: # Blank line: _maybe_ we are done.
    +                i += 1 # advance
     
    -        * pattern: A regular expression that matches a pattern
    +                # Find the next non-blank line
    +                for j in range(i, len(lines)):  
    +                    if lines[j].strip():
    +                        next_line = lines[j]; break
    +                else:
    +                    break # There is no more text; we are done.
     
    -        """
    -        self.pattern = pattern
    -        self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL)
    +                # Check if the next non-blank line is tabbed
    +                if detab(next_line): # Yes, more work to do.
    +                    items.append("")
    +                    continue
    +                else:
    +                    break # No, we are done.
    +        else:
    +            i += 1
     
    -        # Api for Markdown to pass safe_mode into instance
    -        self.safe_mode = False
    +        return items, lines[i:]
     
    -    def getCompiledRegExp (self):
    -        """ Return a compiled regular expression. """
    -        return self.compiled_re
     
    -    def handleMatch(self, m):
    -        """Return a ElementTree element from the given match.
     
    -        Subclasses should override this method.
     
    -        Keyword arguments:
    +"""
    +PRE-PROCESSORS
    +=============================================================================
     
    -        * m: A re match object containing a match of the pattern.
    +Preprocessors work on source text before we start doing anything too
    +complicated.  There are two types of preprocessors: TextPreprocessor and
    +Preprocessor.
    +"""
    +
    +class TextPreprocessor:
    +    """
    +    TextPreprocessors are run before the text is broken into lines.
    +    
    +    Each TextPreprocessor implements a "run" method that takes a pointer to a
    +    text string of the document, modifies it as necessary and returns
    +    either the same pointer or a pointer to a new string.  
    +    
    +    TextPreprocessors must extend markdown.TextPreprocessor.
    +
    +    """
     
    +    def run(self, text):
    +        """ 
    +        Each subclass of TextPreprocessor should override the `run` method, 
    +        which takes the document text as a single string and returns the 
    +        (possibly modified) document as a single string.
    +        
             """
             pass
    -    
    -    def type(self):
    -        """ Return class name, to define pattern type """
    -        return self.__class__.__name__
     
    -BasePattern = Pattern # for backward compatibility
     
    -class SimpleTextPattern (Pattern):
    -    """ Return a simple text of group(2) of a Pattern. """
    -    def handleMatch(self, m):
    -        text = m.group(2)
    -        if text == INLINE_PLACEHOLDER_PREFIX:
    -            return None
    -        return text
    +class Preprocessor:
    +    """
    +    Preprocessors are run after the text is broken into lines.
     
    -class SimpleTagPattern (Pattern):
    -    """ 
    -    Return element of type `tag` with a text attribute of group(3) 
    -    of a Pattern. 
    +    Each preprocessor implements a "run" method that takes a pointer to a
    +    list of lines of the document, modifies it as necessary and returns
    +    either the same pointer or a pointer to a new list.  
    +    
    +    Preprocessors must extend markdown.Preprocessor.
         
         """
    -    def __init__ (self, pattern, tag):
    -        Pattern.__init__(self, pattern)
    -        self.tag = tag
     
    -    def handleMatch(self, m):
    -        el = etree.Element(self.tag)
    -        el.text = m.group(3)
    -        return el
    -
    -class SubstituteTagPattern (SimpleTagPattern):
    -    """ Return a eLement of type `tag` with no children. """
    -    def handleMatch (self, m):
    -        return etree.Element(self.tag)
    +    def run(self, lines):
    +        """
    +        Each subclass of Preprocessor should override the `run` method, which
    +        takes the document as a list of strings split by newlines and returns
    +        the (possibly modified) list of lines.
     
    -class BacktickPattern (Pattern):
    -    """ Return a `<code>` element containing the matching text. """
    -    def __init__ (self, pattern):
    -        Pattern.__init__(self, pattern)
    -        self.tag = "code"
    +        """
    +        pass
    + 
     
    -    def handleMatch(self, m):
    -        el = etree.Element(self.tag)
    -        el.text = AtomicString(m.group(3).strip())
    -        return el
    +class HtmlBlockPreprocessor(TextPreprocessor):
    +    """Remove html blocks from the text and store them for later retrieval."""
     
    +    right_tag_patterns = ["</%s>", "%s>"]  # closing-tag templates, tried in order; %s is the tag name
    +    
    +    def _get_left_tag(self, block):
    +        return block[1:].replace(">", " ", 1).split()[0].lower()
     
    -class DoubleTagPattern (SimpleTagPattern): 
    -    """Return a ElementTree element nested in tag2 nested in tag1.
    +    def _get_right_tag(self, left_tag, block):        
    +        for p in self.right_tag_patterns:
    +            tag = p % left_tag
    +            i = block.rfind(tag)
    +            if i > 2:
    +                return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
    +        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
     
    -    Useful for strong emphasis etc.
    +    def _equal_tags(self, left_tag, right_tag):
    +        if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
    +            return True
    +        if ("/" + left_tag) == right_tag:
    +            return True
    +        if (right_tag == "--" and left_tag == "--"):
    +            return True
    +        elif left_tag == right_tag[1:] \
    +            and right_tag[0] != "<":
    +            return True
    +        else:
    +            return False
     
    -    """
    -    def handleMatch(self, m):
    -        tag1, tag2 = self.tag.split(",")
    -        el1 = etree.Element(tag1)
    -        el2 = etree.SubElement(el1, tag2)
    -        el2.text = m.group(3)
    -        return el1
    +    def _is_oneliner(self, tag):
    +        return (tag in ['hr', 'hr/'])
     
    +    def run(self, text):
    +        new_blocks = []
    +        text = text.split("\n\n")        
    +        items = []
    +        left_tag = ''
    +        right_tag = ''
    +        in_tag = False # flag
     
    -class HtmlPattern (Pattern):
    -    """ Store raw inline html and return a placeholder. """
    -    def handleMatch (self, m):
    -        rawhtml = m.group(2)
    -        inline = True
    -        place_holder = self.stash.store(rawhtml)
    -        return place_holder
    +        while text:
    +            block = text[0]
    +            if block.startswith("\n"):
    +                block = block[1:]
    +            text = text[1:]
    +            
    +            if block.startswith("\n"):
    +                block = block[1:]
     
    +            if not in_tag:
    +                if block.startswith("<"):
    +                    left_tag = self._get_left_tag(block)
    +                    right_tag, data_index = self._get_right_tag(left_tag, block)
    +                    
    +                    if data_index < len(block):
    +                        text.insert(0, block[data_index:])
    +                        block = block[:data_index]
     
    +                    if not (isBlockLevel(left_tag) \
    +                        or block[1] in ["!", "?", "@", "%"]):
    +                        new_blocks.append(block)
    +                        continue
     
    -class LinkPattern (Pattern):
    -    """ Return a link element from the given match. """
    -    def handleMatch(self, m):
    -        el = etree.Element("a")
    -        el.text = m.group(2)
    -        title = m.group(11)
    -        href = m.group(9)
    +                    if self._is_oneliner(left_tag):
    +                        new_blocks.append(block.strip())
    +                        continue
    +                        
    +                    if block[1] == "!":
    +                        # is a comment block
    +                        left_tag = "--"
    +                        right_tag, data_index = self._get_right_tag(left_tag, block)
    +                        # keep checking conditions below and maybe just append
    +                        
    +                    if block.rstrip().endswith(">") \
    +                        and self._equal_tags(left_tag, right_tag):
    +                        new_blocks.append(
    +                            self.stash.store(block.strip()))
    +                        continue
    +                    else: #if not block[1] == "!":
    +                        # if is block level tag and is not complete
    +                        
    +                        if isBlockLevel(left_tag) or left_tag == "--" \
    +                        and not block.rstrip().endswith(">"):
    +                            items.append(block.strip())
    +                            in_tag = True
    +                        else:
    +                            new_blocks.append(
    +                            self.stash.store(block.strip()))
    +                            
    +                        continue
     
    -        if href:
    -            if href[0] == "<":
    -                href = href[1:-1]
    -            el.set("href", self.sanitize_url(href.strip()))
    -        else:
    -            el.set("href", "")
    -            
    -        if title:
    -            title = dequote(title) #.replace('"', """)
    -            el.set("title", title)
    -        return el
    +                new_blocks.append(block)
     
    -    def sanitize_url(self, url):
    -        """ 
    -        Sanitize a url against xss attacks in "safe_mode".
    +            else:
    +                items.append(block.strip())
    +                
    +                right_tag, data_index = self._get_right_tag(left_tag, block)
    +                
    +                if self._equal_tags(left_tag, right_tag):
    +                    # if find closing tag
    +                    in_tag = False
    +                    new_blocks.append(
    +                        self.stash.store('\n\n'.join(items)))
    +                    items = []
     
    -        Rather than specifically blacklisting `javascript:alert("XSS")` and all
    -        its aliases (see ), we whitelist known
    -        safe url formats. Most urls contain a network location, however some 
    -        are known not to (i.e.: mailto links). Script urls do not contain a 
    -        location. Additionally, for `javascript:...`, the scheme would be 
    -        "javascript" but some aliases will appear to `urlparse()` to have no 
    -        scheme. On top of that relative links (i.e.: "foo/bar.html") have no 
    -        scheme. Therefore we must check "path", "parameters", "query" and 
    -        "fragment" for any literal colons. We don't check "scheme" for colons 
    -        because it *should* never have any and "netloc" must allow the form:
    -        `username:password@host:port`.
    -        
    -        """
    -        locless_schemes = ['', 'mailto', 'news']
    -        scheme, netloc, path, params, query, fragment = url = urlparse(url)
    -        safe_url = False
    -        if netloc != '' or scheme in locless_schemes:
    -            safe_url = True
    +        if items:
    +            new_blocks.append(self.stash.store('\n\n'.join(items)))
    +            new_blocks.append('\n')
    +            
    +        return "\n\n".join(new_blocks)
     
    -        for part in url[2:]:
    -            if ":" in part:
    -                safe_url = False
    +HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
     
    -        if self.safe_mode and not safe_url:
    -            return ''
    -        else:
    -            return urlunparse(url)
     
    -class ImagePattern(LinkPattern):
    -    """ Return a img element from the given match. """
    -    def handleMatch(self, m):
    -        el = etree.Element("img")
    -        src_parts = m.group(9).split()
    -        if src_parts:
    -            src = src_parts[0]
    -            if src[0] == "<" and src[-1] == ">":
    -                src = src[1:-1]
    -            el.set('src', self.sanitize_url(src))
    -        else:
    -            el.set('src', "")
    -        if len(src_parts) > 1:
    -            el.set('title', dequote(" ".join(src_parts[1:])))
    -  
    -        if ENABLE_ATTRIBUTES:
    -            truealt = handleAttributes(m.group(2), el)
    -        else:
    -            truealt = m.group(2)
    -            
    -        el.set('alt', truealt)
    -        return el
    +class HeaderPreprocessor(Preprocessor):
     
    -class ReferencePattern(LinkPattern):
    -    """ Match to a stored reference and return link element. """
    -    def handleMatch(self, m):
    +    """Replace underlined headers with hashed headers.
     
    -        if m.group(9):
    -            id = m.group(9).lower()
    -        else:
    -            # if we got something like "[Google][]"
    -            # we'll use "google" as the id
    -            id = m.group(2).lower()
    +    (To avoid the need for lookahead later.)
     
    -        if not self.references.has_key(id): # ignore undefined refs
    -            return None
    -        href, title = self.references[id]
    +    """
     
    -        text = m.group(2)
    -        return self.makeTag(href, title, text)
    +    def run (self, lines):
    +        i = -1
    +        while i+1 < len(lines):
    +            i = i+1
    +            if not lines[i].strip():
    +                continue
     
    -    def makeTag(self, href, title, text):
    -        el = etree.Element('a')
    -        
    -        el.set('href', self.sanitize_url(href))
    -        if title:
    -            el.set('title', title)
    +            if lines[i].startswith("#"):
    +                lines.insert(i+1, "\n")
     
    -        el.text = text
    -        return el
    +            if (i+1 <= len(lines)
    +                  and lines[i+1]
    +                  and lines[i+1][0] in ['-', '=']):
     
    +                underline = lines[i+1].strip()
     
    -class ImageReferencePattern (ReferencePattern):
    -    """ Match to a stored reference and return img element. """
    -    def makeTag(self, href, title, text):
    -        el = etree.Element("img")
    -        el.set("src", self.sanitize_url(href))
    -        if title:
    -            el.set("title", title)
    -        el.set("alt", text)
    -        return el
    +                if underline == "="*len(underline):
    +                    lines[i] = "# " + lines[i].strip()
    +                    lines[i+1] = ""
    +                elif underline == "-"*len(underline):
    +                    lines[i] = "## " + lines[i].strip()
    +                    lines[i+1] = ""
     
    +        return lines
     
    -class AutolinkPattern (Pattern):
    -    """ Return a link Element given an autolink (``). """
    -    def handleMatch(self, m):
    -        el = etree.Element("a")
    -        el.set('href', m.group(2))
    -        el.text = AtomicString(m.group(2))
    -        return el
    +HEADER_PREPROCESSOR = HeaderPreprocessor()
     
    -class AutomailPattern (Pattern):
    -    """ 
    -    Return a mailto link Element given an automail link (``). 
    -    """
    -    def handleMatch(self, m):
    -        el = etree.Element('a')
    -        email = m.group(2)
    -        if email.startswith("mailto:"):
    -            email = email[len("mailto:"):]
     
    -        def codepoint2name(code):
    -            """Return entity definition by code, or the code if not defined."""
    -            entity = htmlentitydefs.codepoint2name.get(code)
    -            if entity:
    -                return "%s%s;" % (AMP_SUBSTITUTE, entity)
    -            else:
    -                return "%s#%d;" % (AMP_SUBSTITUTE, code)
    +class LinePreprocessor(Preprocessor):
    +    """Convert HR lines to "___" format."""
    +    blockquote_re = re.compile(r'^(> )+')
     
    -        letters = [codepoint2name(ord(letter)) for letter in email]
    -        el.text = AtomicString(''.join(letters))
    +    def run (self, lines):
    +        for i in range(len(lines)):
    +            prefix = ''
    +            m = self.blockquote_re.search(lines[i])
    +            if m: 
    +                prefix = m.group(0)
    +            if self._isLine(lines[i][len(prefix):]):
    +                lines[i] = prefix + "___"
    +        return lines
     
    -        mailto = "mailto:" + email
    -        mailto = "".join([AMP_SUBSTITUTE + '#%d;' % 
    -                          ord(letter) for letter in mailto])
    -        el.set('href', mailto)
    -        return el
    +    def _isLine(self, block):
    +        """Determine if a block should be replaced with an 
    """ + if block.startswith(" "): + return False # a code block + text = "".join([x for x in block if not x.isspace()]) + if len(text) <= 2: + return False + for pattern in ['isline1', 'isline2', 'isline3']: + m = CORE_RE[pattern].match(text) + if (m and m.group(1)): + return True + else: + return False -ESCAPE_PATTERN = SimpleTextPattern(ESCAPE_RE) -NOT_STRONG_PATTERN = SimpleTextPattern(NOT_STRONG_RE) +LINE_PREPROCESSOR = LinePreprocessor() -BACKTICK_PATTERN = BacktickPattern(BACKTICK_RE) -STRONG_PATTERN = SimpleTagPattern(STRONG_RE, 'strong') -EMPHASIS_PATTERN = SimpleTagPattern(EMPHASIS_RE, 'em') -EMPHASIS_PATTERN_2 = SimpleTagPattern(EMPHASIS_2_RE, 'em') -STRONG_EM_PATTERN = DoubleTagPattern(STRONG_EM_RE, 'strong,em') +class ReferencePreprocessor(Preprocessor): + """Remove reference definitions from the text and store them for later use.""" + def run (self, lines): + new_text = []; + for line in lines: + m = CORE_RE['reference-def'].match(line) + if m: + id = m.group(2).strip().lower() + t = m.group(4).strip() # potential title + if not t: + self.references[id] = (m.group(3), t) + elif (len(t) >= 2 + and (t[0] == t[-1] == "\"" + or t[0] == t[-1] == "\'" + or (t[0] == "(" and t[-1] == ")") ) ): + self.references[id] = (m.group(3), t[1:-1]) + else: + new_text.append(line) + else: + new_text.append(line) -LINE_BREAK_PATTERN = SubstituteTagPattern(LINE_BREAK_RE, 'br') -LINE_BREAK_PATTERN_2 = SubstituteTagPattern(LINE_BREAK_2_RE, 'br') + return new_text #+ "\n" -LINK_PATTERN = LinkPattern(LINK_RE) -IMAGE_LINK_PATTERN = ImagePattern(IMAGE_LINK_RE) -IMAGE_REFERENCE_PATTERN = ImageReferencePattern(IMAGE_REFERENCE_RE) -REFERENCE_PATTERN = ReferencePattern(REFERENCE_RE) +REFERENCE_PREPROCESSOR = ReferencePreprocessor() -HTML_PATTERN = HtmlPattern(HTML_RE) -ENTITY_PATTERN = HtmlPattern(ENTITY_RE) -AUTOLINK_PATTERN = AutolinkPattern(AUTOLINK_RE) -AUTOMAIL_PATTERN = AutomailPattern(AUTOMAIL_RE) """ -POST-PROCESSORS +INLINE PATTERNS 
============================================================================= -Markdown also allows post-processors, which are similar to preprocessors in -that they need to implement a "run" method. However, they are run after core -processing. - -There are two types of post-processors: Postprocessor and TextPostprocessor -""" - -class Postprocessor: - """ - Postprocessors are run before the ElementTree serialization. - - Each Postprocessor implements a "run" method that takes a pointer to a - ElementTree, modifies it as necessary and returns a ElementTree - document. - - Postprocessors must extend markdown.Postprocessor. +Inline patterns such as *emphasis* are handled by means of auxiliary +objects, one per pattern. Pattern objects must be instances of classes +that extend markdown.Pattern. Each pattern object uses a single regular +expression and needs support the following methods: - """ - def run(self, root): - """ - Subclasses of Postprocessor should implement a `run` method, which - takes a root Element. Method can return another Element, and global - root Element will be replaced, or just modify current and return None. - """ - pass + pattern.getCompiledRegExp() # returns a regular expression + pattern.handleMatch(m) # takes a match object and returns + # an ElementTree element or just plain text -class TextPostprocessor: - """ - TextPostprocessors are run after the ElementTree it converted back into text. - - Each TextPostprocessor implements a "run" method that takes a pointer to a - text string, modifies it as necessary and returns a text string. - - TextPostprocessors must extend markdown.TextPostprocessor. - - """ +All of python markdown's built-in patterns subclass from Pattern, +but you can add additional patterns that don't. - def run(self, text): - """ - Subclasses of TextPostprocessor should implement a `run` method, which - takes the html document as a single text string and returns a - (possibly modified) string. 
+Also note that all the regular expressions used by inline must +capture the whole block. For this reason, they all start with +'^(.*)' and end with '(.*)!'. In case with built-in expression +Pattern takes care of adding the "^(.*)" and "(.*)!". - """ - pass +Finally, the order in which regular expressions are applied is very +important - e.g. if we first replace http://.../ links with
    tags +and _then_ try to replace inline html, we would end up with a mess. +So, we apply the expressions in the following order: +* escape and backticks have to go before everything else, so + that we can preempt any markdown patterns by escaping them. -class PrettifyPostprocessor(Postprocessor): - """Add linebreaks to the html document.""" - def _prettifyETree(self, elem): - """Recursively add linebreaks to ElementTree children.""" - i = "\n" - if isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']: - if (not elem.text or not elem.text.strip()) \ - and len(elem) and isBlockLevel(elem[0].tag): - elem.text = i - for e in elem: - if isBlockLevel(e.tag): - self._prettifyETree(e) - if not elem.tail or not elem.tail.strip(): - elem.tail = i - if not elem.tail or not elem.tail.strip(): - elem.tail = i +* then we handle auto-links (must be done before inline html) - def run(self, root): - """.Add linebreaks to ElementTree root object.""" - self._prettifyETree(root) - # Do
    's seperately as they are often in the middle of - # inline content and missed by _prettifyETree. - brs = root.getiterator('br') - for br in brs: - if not br.tail or not br.tail.strip(): - br.tail = '\n' - else: - br.tail = '\n%s' % br.tail +* then we handle inline HTML. At this point we will simply + replace all inline HTML strings with a placeholder and add + the actual HTML to a hash. -PRETTIFYPOSTPROCESSOR = PrettifyPostprocessor() +* then inline images (must be done before links) +* then bracketed links, first regular then reference-style -class RawHtmlTextPostprocessor(TextPostprocessor): - """ Restore raw html to the document. """ - def __init__(self): - pass +* finally we apply strong and emphasis +""" - def run(self, text): - """ Iterate over html stash and restore "safe" html. """ - for i in range(self.stash.html_counter): - html, safe = self.stash.rawHtmlBlocks[i] - if self.safeMode and not safe: - if str(self.safeMode).lower() == 'escape': - html = self.escape(html) - elif str(self.safeMode).lower() == 'remove': - html = '' - else: - html = HTML_REMOVED_TEXT - if safe or not self.safeMode: - text = text.replace("

    %s

    " % (HTML_PLACEHOLDER % i), - html + "\n") - text = text.replace(HTML_PLACEHOLDER % i, html) - return text - def escape(self, html): - """ Basic html escaping """ - html = html.replace('&', '&') - html = html.replace('<', '<') - html = html.replace('>', '>') - return html.replace('"', '"') +""" +The actual regular expressions for patterns +----------------------------------------------------------------------------- +""" -RAWHTMLTEXTPOSTPROCESSOR = RawHtmlTextPostprocessor() +NOBRACKET = r'[^\]\[]*' +BRK = ( r'\[(' + + (NOBRACKET + r'(\[')*6 + + (NOBRACKET+ r'\])*')*6 + + NOBRACKET + r')\]' ) +NOIMG = r'(?|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*)\12)?\)''' # [text](url) or [text]() - text = text.replace(AMP_SUBSTITUTE, "&") - return text +IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)' # ![alttxt](http://x.com/) or ![alttxt]() +REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3] +IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2] +NOT_STRONG_RE = r'( \* )' # stand-alone * or _ +AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>' # +AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # -AMPSUBSTITUTETEXTPOSTPROCESSOR = AndSubstitutePostprocessor() +HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...> +ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # & +LINE_BREAK_RE = r' \n' # two spaces at end of line +LINE_BREAK_2_RE = r' $' # two spaces at end of text """ -MISC AUXILIARY CLASSES -============================================================================= +The pattern classes +----------------------------------------------------------------------------- """ -class AtomicString(unicode): - """A string which should not be further processed.""" - pass +class Pattern: + """Base class that inline patterns subclass. """ + def __init__ (self, pattern): + """ + Create an instant of an inline pattern. -class HtmlStash: - """ - This class is used for stashing HTML objects that we extract - in the beginning and replace with place-holders. 
- """ + Keyword arguments: - def __init__ (self): - """ Create a HtmlStash. """ - self.html_counter = 0 # for counting inline html segments - self.rawHtmlBlocks=[] + * pattern: A regular expression that matches a pattern - def store(self, html, safe=False): """ - Saves an HTML segment for later reinsertion. Returns a - placeholder string that needs to be inserted into the - document. + self.pattern = pattern + self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL) + + # Api for Markdown to pass safe_mode into instance + self.safe_mode = False + + def getCompiledRegExp (self): + """ Return a compiled regular expression. """ + return self.compiled_re + + def handleMatch(self, m): + """Return a ElementTree element from the given match. + + Subclasses should override this method. Keyword arguments: - - * html: an html segment - * safe: label an html segment as safe for safemode - - Returns : a placeholder string - + + * m: A re match object containing a match of the pattern. + """ - self.rawHtmlBlocks.append((html, safe)) - placeholder = HTML_PLACEHOLDER % self.html_counter - self.html_counter += 1 - return placeholder + pass - def rest(self): - self.html_counter = 0 - self.rawHtmlBlocks = [] + def type(self): + """ Return class name, to define pattern type """ + return self.__class__.__name__ +BasePattern = Pattern # for backward compatibility + +class SimpleTextPattern (Pattern): + """ Return a simple text of group(2) of a Pattern. """ + def handleMatch(self, m): + text = m.group(2) + if text == INLINE_PLACEHOLDER_PREFIX: + return None + return text + +class SimpleTagPattern (Pattern): + """ + Return element of type `tag` with a text attribute of group(3) + of a Pattern. -class InlineStash: - - def __init__(self): - """ Create a InlineStash. 
""" - self.prefix = INLINE_PLACEHOLDER_PREFIX - self.suffix = ETX - self._nodes = {} - self.phLength = 4 + len(self.prefix) + len(self.suffix) - self._placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})') - - def _genPlaceholder(self, type): - """ Generate a placeholder """ - id = "%04d" % len(self._nodes) - hash = INLINE_PLACEHOLDER % id - return hash, id - - def extractId(self, data, index): - """ - Extract id from data string, start from index - - Keyword arguments: - - * data: string - * index: index, from which we start search - - Returns: placeholder id and string index, after - found placeholder - """ - m = self._placeholder_re.search(data, index) - if m: - return m.group(1), m.end() - else: - return None, index + 1 - - def isin(self, id): - """ Check if node with given id exists in stash """ - return self._nodes.has_key(id) - - def get(self, id): - """ Return node by id """ - return self._nodes.get(id) - - def add(self, node, type): - """ Add node to stash """ - pholder, id = self._genPlaceholder(type) - self._nodes[id] = node - return pholder - - def rest(self): - """ Reset instance """ - self._nodes = {} - -""" -CORE MARKDOWN -============================================================================= - -The core part is still quite messy, despite substantial refactoring. If you -are thinking of extending the syntax, see first if you can do it through -pre-processors, post-processors, inline patterns or a combination of the three. -""" - -def _wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL) -CORE_RE = { - 'header': _wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title - 'reference-def': _wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'), - # [Google]: http://www.google.com/ - 'containsline': _wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc. - 'ol': _wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. 
text - 'ul': _wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text" - 'isline1': _wrapRe(r'(\**)'), # *** - 'isline2': _wrapRe(r'(\-*)'), # --- - 'isline3': _wrapRe(r'(\_*)'), # ___ - 'tabbed': _wrapRe(r'((\t)|( ))(.*)'), # an indented line - 'quoted': _wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...") - 'containsline': re.compile(r'^([-]*)$|^([=]*)$', re.M), - 'attr': re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} -} + """ + def __init__ (self, pattern, tag): + Pattern.__init__(self, pattern) + self.tag = tag + def handleMatch(self, m): + el = etree.Element(self.tag) + el.text = m.group(3) + return el -class Markdown: - """Converts markdown to HTML.""" +class SubstituteTagPattern (SimpleTagPattern): + """ Return a eLement of type `tag` with no children. """ + def handleMatch (self, m): + return etree.Element(self.tag) - def __init__(self, - extensions=[], - extension_configs={}, - safe_mode = False): - """ - Creates a new Markdown instance. +class BacktickPattern (Pattern): + """ Return a `` element containing the matching text. """ + def __init__ (self, pattern): + Pattern.__init__(self, pattern) + self.tag = "code" - Keyword arguments: - - * extensions: A list of extensions. - If they are of type string, the module mdx_name.py will be loaded. - If they are a subclass of markdown.Extension, they will be used - as-is. - * extension-configs: Configuration setting for extensions. - * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". 
- - """ - self.source = None - self.safeMode = safe_mode - self.registeredExtensions = [] - self.docType = "" - self.stripTopLevelTags = True + def handleMatch(self, m): + el = etree.Element(self.tag) + el.text = AtomicString(m.group(3).strip()) + return el - self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR] - self.preprocessors = [HEADER_PREPROCESSOR, - LINE_PREPROCESSOR, - # A footnote preprocessor will - # get inserted here - REFERENCE_PREPROCESSOR] +class DoubleTagPattern (SimpleTagPattern): + """Return a ElementTree element nested in tag2 nested in tag1. + Useful for strong emphasis etc. - self.postprocessors = [PRETTIFYPOSTPROCESSOR, - # a footnote postprocessor will get - # inserted later - ] + """ + def handleMatch(self, m): + tag1, tag2 = self.tag.split(",") + el1 = etree.Element(tag1) + el2 = etree.SubElement(el1, tag2) + el2.text = m.group(3) + return el1 - self.textPostprocessors = [# a footnote postprocessor will get - # inserted here - RAWHTMLTEXTPOSTPROCESSOR, - AMPSUBSTITUTETEXTPOSTPROCESSOR] - self.prePatterns = [] - - self.inlinePatterns = [ - BACKTICK_PATTERN, - ESCAPE_PATTERN, - REFERENCE_PATTERN, - LINK_PATTERN, - IMAGE_LINK_PATTERN, - IMAGE_REFERENCE_PATTERN, - AUTOLINK_PATTERN, - AUTOMAIL_PATTERN, - LINE_BREAK_PATTERN_2, - LINE_BREAK_PATTERN, - HTML_PATTERN, - ENTITY_PATTERN, - NOT_STRONG_PATTERN, - STRONG_EM_PATTERN, - STRONG_PATTERN, - EMPHASIS_PATTERN, - EMPHASIS_PATTERN_2 - # The order of the handlers matters!!! - ] - - self.inlineStash = InlineStash() - self.references = {} - self.htmlStash = HtmlStash() +class HtmlPattern (Pattern): + """ Store raw inline html and return a placeholder. """ + def handleMatch (self, m): + rawhtml = m.group(2) + inline = True + place_holder = self.stash.store(rawhtml) + return place_holder - self.registerExtensions(extensions = extensions, - configs = extension_configs) - self.reset() +class LinkPattern (Pattern): + """ Return a link element from the given match. 
""" + def handleMatch(self, m): + el = etree.Element("a") + el.text = m.group(2) + title = m.group(11) + href = m.group(9) + if href: + if href[0] == "<": + href = href[1:-1] + el.set("href", self.sanitize_url(href.strip())) + else: + el.set("href", "") + + if title: + title = dequote(title) #.replace('"', """) + el.set("title", title) + return el - def registerExtensions(self, extensions, configs): + def sanitize_url(self, url): """ - Register extensions with this instance of Markdown. + Sanitize a url against xss attacks in "safe_mode". - Keyword aurguments: - - * extensions: A list of extensions, which can either - be strings or objects. See the docstring on Markdown. - * configs: A dictionary mapping module names to config options. + Rather than specifically blacklisting `javascript:alert("XSS")` and all + its aliases (see ), we whitelist known + safe url formats. Most urls contain a network location, however some + are known not to (i.e.: mailto links). Script urls do not contain a + location. Additionally, for `javascript:...`, the scheme would be + "javascript" but some aliases will appear to `urlparse()` to have no + scheme. On top of that relative links (i.e.: "foo/bar.html") have no + scheme. Therefore we must check "path", "parameters", "query" and + "fragment" for any literal colons. We don't check "scheme" for colons + because it *should* never have any and "netloc" must allow the form: + `username:password@host:port`. """ - for ext in extensions: - if isinstance(ext, basestring): - ext = load_extension(ext, configs.get(ext, [])) - elif hasattr(ext, 'extendMarkdown'): - # Looks like an Extension. - # Nothing to do here. - pass - else: - message(ERROR, "Incorrect type! Extension '%s' is " - "neither a string or an Extension." 
%(repr(ext))) - continue - ext.extendMarkdown(self, globals()) + locless_schemes = ['', 'mailto', 'news'] + scheme, netloc, path, params, query, fragment = url = urlparse(url) + safe_url = False + if netloc != '' or scheme in locless_schemes: + safe_url = True - def registerExtension(self, extension): - """ This gets called by the extension """ - self.registeredExtensions.append(extension) + for part in url[2:]: + if ":" in part: + safe_url = False - def reset(self): - """ - Resets all state variables so that we can start with a new text. - """ - self.inlineStash.rest() - self.htmlStash.rest() - self.references.clear() + if self.safe_mode and not safe_url: + return '' + else: + return urlunparse(url) - HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash - LINE_PREPROCESSOR.stash = self.htmlStash - REFERENCE_PREPROCESSOR.references = self.references - HTML_PATTERN.stash = self.htmlStash - ENTITY_PATTERN.stash = self.htmlStash - REFERENCE_PATTERN.references = self.references - IMAGE_REFERENCE_PATTERN.references = self.references - RAWHTMLTEXTPOSTPROCESSOR.stash = self.htmlStash - RAWHTMLTEXTPOSTPROCESSOR.safeMode = self.safeMode +class ImagePattern(LinkPattern): + """ Return a img element from the given match. """ + def handleMatch(self, m): + el = etree.Element("img") + src_parts = m.group(9).split() + if src_parts: + src = src_parts[0] + if src[0] == "<" and src[-1] == ">": + src = src[1:-1] + el.set('src', self.sanitize_url(src)) + else: + el.set('src', "") + if len(src_parts) > 1: + el.set('title', dequote(" ".join(src_parts[1:]))) + + if ENABLE_ATTRIBUTES: + truealt = handleAttributes(m.group(2), el) + else: + truealt = m.group(2) + + el.set('alt', truealt) + return el - for extension in self.registeredExtensions: - extension.reset() +class ReferencePattern(LinkPattern): + """ Match to a stored reference and return link element. 
""" + def handleMatch(self, m): - for pattern in self.inlinePatterns: - pattern.safe_mode = self.safeMode + if m.group(9): + id = m.group(9).lower() + else: + # if we got something like "[Google][]" + # we'll use "google" as the id + id = m.group(2).lower() - def _processSection(self, parent_elem, lines, - inList=0, looseList=0): - """ - Process a section of a source document, looking for high - level structural elements like lists, block quotes, code - segments, html blocks, etc. Some those then get stripped - of their high level markup (e.g. get unindented) and the - lower-level markup is processed recursively. + if not self.references.has_key(id): # ignore undefined refs + return None + href, title = self.references[id] - Keyword arguments: - - * parent_elem: A ElementTree element to which the content will be added. - * lines: a list of lines - * inList: a level - - Returns: None + text = m.group(2) + return self.makeTag(href, title, text) + + def makeTag(self, href, title, text): + el = etree.Element('a') - """ - # Loop through lines until none left. - while lines: - - # Skipping empty line - if not lines[0]: - lines = lines[1:] - continue - - # Check if this section starts with a list, a blockquote or - # a code block + el.set('href', self.sanitize_url(href)) + if title: + el.set('title', title) - processFn = { 'ul': self._processUList, - 'ol': self._processOList, - 'quoted': self._processQuote, - 'tabbed': self._processCodeBlock} + el.text = text + return el - for regexp in ['ul', 'ol', 'quoted', 'tabbed']: - m = CORE_RE[regexp].match(lines[0]) - if m: - processFn[regexp](parent_elem, lines, inList) - return - # We are NOT looking at one of the high-level structures like - # lists or blockquotes. So, it's just a regular paragraph - # (though perhaps nested inside a list or something else). If - # we are NOT inside a list, we just need to look for a blank - # line to find the end of the block. 
If we ARE inside a - # list, however, we need to consider that a sublist does not - # need to be separated by a blank line. Rather, the following - # markup is legal: - # - # * The top level list item - # - # Another paragraph of the list. This is where we are now. - # * Underneath we might have a sublist. - # +class ImageReferencePattern (ReferencePattern): + """ Match to a stored reference and return img element. """ + def makeTag(self, href, title, text): + el = etree.Element("img") + el.set("src", self.sanitize_url(href)) + if title: + el.set("title", title) + el.set("alt", text) + return el + + +class AutolinkPattern (Pattern): + """ Return a link Element given an autolink (``). """ + def handleMatch(self, m): + el = etree.Element("a") + el.set('href', m.group(2)) + el.text = AtomicString(m.group(2)) + return el + +class AutomailPattern (Pattern): + """ + Return a mailto link Element given an automail link (``). + """ + def handleMatch(self, m): + el = etree.Element('a') + email = m.group(2) + if email.startswith("mailto:"): + email = email[len("mailto:"):] - if inList: + def codepoint2name(code): + """Return entity definition by code, or the code if not defined.""" + entity = htmlentitydefs.codepoint2name.get(code) + if entity: + return "%s%s;" % (AMP_SUBSTITUTE, entity) + else: + return "%s#%d;" % (AMP_SUBSTITUTE, code) - start, lines = self._linesUntil(lines, (lambda line: - CORE_RE['ul'].match(line) - or CORE_RE['ol'].match(line) - or not line.strip())) + letters = [codepoint2name(ord(letter)) for letter in email] + el.text = AtomicString(''.join(letters)) - self._processSection(parent_elem, start, - inList - 1, looseList = looseList) - inList = inList-1 + mailto = "mailto:" + email + mailto = "".join([AMP_SUBSTITUTE + '#%d;' % + ord(letter) for letter in mailto]) + el.set('href', mailto) + return el - else: # Ok, so it's just a simple block +ESCAPE_PATTERN = SimpleTextPattern(ESCAPE_RE) +NOT_STRONG_PATTERN = SimpleTextPattern(NOT_STRONG_RE) - paragraph, 
lines = self._linesUntil(lines, lambda line: - not line.strip() or line[0] == '>') +BACKTICK_PATTERN = BacktickPattern(BACKTICK_RE) +STRONG_PATTERN = SimpleTagPattern(STRONG_RE, 'strong') +EMPHASIS_PATTERN = SimpleTagPattern(EMPHASIS_RE, 'em') +EMPHASIS_PATTERN_2 = SimpleTagPattern(EMPHASIS_2_RE, 'em') - if len(paragraph) and paragraph[0].startswith('#'): - self._processHeader(parent_elem, paragraph) - - elif len(paragraph) and \ - CORE_RE["isline3"].match(paragraph[0]): +STRONG_EM_PATTERN = DoubleTagPattern(STRONG_EM_RE, 'strong,em') - self._processHR(parent_elem) - lines = paragraph[1:] + lines - - elif paragraph: - self._processParagraph(parent_elem, paragraph, - inList, looseList) +LINE_BREAK_PATTERN = SubstituteTagPattern(LINE_BREAK_RE, 'br') +LINE_BREAK_PATTERN_2 = SubstituteTagPattern(LINE_BREAK_2_RE, 'br') - if lines and not lines[0].strip(): - lines = lines[1:] # skip the first (blank) line +LINK_PATTERN = LinkPattern(LINK_RE) +IMAGE_LINK_PATTERN = ImagePattern(IMAGE_LINK_RE) +IMAGE_REFERENCE_PATTERN = ImageReferencePattern(IMAGE_REFERENCE_RE) +REFERENCE_PATTERN = ReferencePattern(REFERENCE_RE) - def _processHR(self, parentElem): - hr = etree.SubElement(parentElem, "hr") - - def _processHeader(self, parentElem, paragraph): - m = CORE_RE['header'].match(paragraph[0]) - if m: - level = len(m.group(1)) - h = etree.SubElement(parentElem, "h%d" % level) - h.text = m.group(2).strip() - else: - message(CRITICAL, "We've got a problem header!") +HTML_PATTERN = HtmlPattern(HTML_RE) +ENTITY_PATTERN = HtmlPattern(ENTITY_RE) +AUTOLINK_PATTERN = AutolinkPattern(AUTOLINK_RE) +AUTOMAIL_PATTERN = AutomailPattern(AUTOMAIL_RE) - def _processParagraph(self, parentElem, paragraph, inList, looseList): - if ( parentElem.tag == 'li' - and not (looseList or parentElem.getchildren())): +""" +POST-PROCESSORS +============================================================================= - # If this is the first paragraph inside "li", don't - # put

    around it - append the paragraph bits directly - # onto parentElem - el = parentElem - else: - # Otherwise make a "p" element - el = etree.SubElement(parentElem, "p") +Markdown also allows post-processors, which are similar to preprocessors in +that they need to implement a "run" method. However, they are run after core +processing. - dump = [] - - # Searching for hr or header - for line in paragraph: - # it's hr - if CORE_RE["isline3"].match(line): - el.text = "\n".join(dump) - self._processHR(el) - dump = [] - # it's header - elif line.startswith("#"): - el.text = "\n".join(dump) - self._processHeader(parentElem, [line]) - dump = [] - else: - dump.append(line) - if dump: - text = "\n".join(dump) - el.text = text +There are two types of post-processors: Postprocessor and TextPostprocessor +""" - def _processUList(self, parentElem, lines, inList): - self._processList(parentElem, lines, inList, - listexpr='ul', tag = 'ul') +class Postprocessor: + """ + Postprocessors are run before the ElementTree serialization. + + Each Postprocessor implements a "run" method that takes a pointer to a + ElementTree, modifies it as necessary and returns a ElementTree + document. + + Postprocessors must extend markdown.Postprocessor. - def _processOList(self, parentElem, lines, inList): - self._processList(parentElem, lines, inList, - listexpr='ol', tag = 'ol') + """ + def run(self, root): + """ + Subclasses of Postprocessor should implement a `run` method, which + takes a root Element. Method can return another Element, and global + root Element will be replaced, or just modify current and return None. + """ + pass - def _processList(self, parentElem, lines, inList, listexpr, tag): +class TextPostprocessor: + """ + TextPostprocessors are run after the ElementTree it converted back into text. + + Each TextPostprocessor implements a "run" method that takes a pointer to a + text string, modifies it as necessary and returns a text string. 
+ + TextPostprocessors must extend markdown.TextPostprocessor. + + """ + + def run(self, text): """ - Given a list of document lines starting with a list item, - finds the end of the list, breaks it up, and recursively - processes each list item and the remainder of the text file. + Subclasses of TextPostprocessor should implement a `run` method, which + takes the html document as a single text string and returns a + (possibly modified) string. - Keyword arguments: - - * parentElem: A ElementTree element to which the content will be added - * lines: a list of lines - * inList: a level - - Returns: None - """ - ul = etree.SubElement(parentElem, tag) # ul might actually be '

      ' + pass - looseList = 0 - # Make a list of list items - items = [] - item = -1 +class PrettifyPostprocessor(Postprocessor): + """Add linebreaks to the html document.""" + def _prettifyETree(self, elem): + """Recursively add linebreaks to ElementTree children.""" + i = "\n" + if isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']: + if (not elem.text or not elem.text.strip()) \ + and len(elem) and isBlockLevel(elem[0].tag): + elem.text = i + for e in elem: + if isBlockLevel(e.tag): + self._prettifyETree(e) + if not elem.tail or not elem.tail.strip(): + elem.tail = i + if not elem.tail or not elem.tail.strip(): + elem.tail = i - i = 0 # a counter to keep track of where we are + def run(self, root): + """.Add linebreaks to ElementTree root object.""" + self._prettifyETree(root) + # Do
      's seperately as they are often in the middle of + # inline content and missed by _prettifyETree. + brs = root.getiterator('br') + for br in brs: + if not br.tail or not br.tail.strip(): + br.tail = '\n' + else: + br.tail = '\n%s' % br.tail - for line in lines: +PRETTIFYPOSTPROCESSOR = PrettifyPostprocessor() - loose = 0 - if not line.strip(): - # If we see a blank line, this _might_ be the end of the list - i += 1 - loose = 1 - # Find the next non-blank line - for j in range(i, len(lines)): - if lines[j].strip(): - next = lines[j] - break +class RawHtmlTextPostprocessor(TextPostprocessor): + """ Restore raw html to the document. """ + def __init__(self): + pass + + def run(self, text): + """ Iterate over html stash and restore "safe" html. """ + for i in range(self.stash.html_counter): + html, safe = self.stash.rawHtmlBlocks[i] + if self.safeMode and not safe: + if str(self.safeMode).lower() == 'escape': + html = self.escape(html) + elif str(self.safeMode).lower() == 'remove': + html = '' else: - # There is no more text => end of the list - break + html = HTML_REMOVED_TEXT + if safe or not self.safeMode: + text = text.replace("

      %s

      " % (HTML_PLACEHOLDER % i), + html + "\n") + text = text.replace(HTML_PLACEHOLDER % i, html) + return text - # Check if the next non-blank line is still a part of the list + def escape(self, html): + """ Basic html escaping """ + html = html.replace('&', '&') + html = html.replace('<', '<') + html = html.replace('>', '>') + return html.replace('"', '"') - if ( CORE_RE[listexpr].match(next) or - CORE_RE['tabbed'].match(next) ): - # get rid of any white space in the line - items[item].append(line.strip()) - looseList = loose or looseList - continue - else: - break # found end of the list +RAWHTMLTEXTPOSTPROCESSOR = RawHtmlTextPostprocessor() - # Now we need to detect list items (at the current level) - # while also detabing child elements if necessary - for expr in ['ul', 'ol', 'tabbed']: +class AndSubstitutePostprocessor(TextPostprocessor): + """ Restore valid entities """ + def __init__(self): + pass - m = CORE_RE[expr].match(line) - if m: - if expr in ['ul', 'ol']: # We are looking at a new item - #if m.group(1) : - # Removed the check to allow for a blank line - # at the beginning of the list item - items.append([m.group(1)]) - item += 1 - elif expr == 'tabbed': # This line needs to be detabbed - items[item].append(m.group(4)) #after the 'tab' + def run(self, text): - i += 1 - break - else: - items[item].append(line) # Just regular continuation - i += 1 # added on 2006.02.25 - else: - i += 1 + text = text.replace(AMP_SUBSTITUTE, "&") + return text - # Add the ElementTree elements - for item in items: - li = etree.SubElement(ul, "li") +AMPSUBSTITUTETEXTPOSTPROCESSOR = AndSubstitutePostprocessor() - self._processSection(li, item, inList + 1, looseList = looseList) - # Process the remaining part of the section +""" +MISC AUXILIARY CLASSES +============================================================================= +""" - self._processSection(parentElem, lines[i:], inList) +class AtomicString(unicode): + """A string which should not be further processed.""" + 
pass - def _linesUntil(self, lines, condition): - """ - A utility function to break a list of lines upon the - first line that satisfied a condition. The condition - argument should be a predicate function. - - """ - i = -1 - for line in lines: - i += 1 - if condition(line): - break - else: - i += 1 - return lines[:i], lines[i:] +class HtmlStash: + """ + This class is used for stashing HTML objects that we extract + in the beginning and replace with place-holders. + """ - def _processQuote(self, parentElem, lines, inList): + def __init__ (self): + """ Create a HtmlStash. """ + self.html_counter = 0 # for counting inline html segments + self.rawHtmlBlocks=[] + + def store(self, html, safe=False): """ - Given a list of document lines starting with a quote finds - the end of the quote, unindents it and recursively - processes the body of the quote and the remainder of the - text file. + Saves an HTML segment for later reinsertion. Returns a + placeholder string that needs to be inserted into the + document. 
Keyword arguments: - * parentElem: ElementTree element to which the content will be added - * lines: a list of lines - * inList: a level + * html: an html segment + * safe: label an html segment as safe for safemode - Returns: None + Returns : a placeholder string """ - dequoted = [] - i = 0 - blank_line = False # allow one blank line between paragraphs - for line in lines: - m = CORE_RE['quoted'].match(line) - if m: - dequoted.append(m.group(1)) - i += 1 - blank_line = False - elif not blank_line and line.strip() != '': - dequoted.append(line) - i += 1 - elif not blank_line and line.strip() == '': - dequoted.append(line) - i += 1 - blank_line = True - else: - break - - blockquote = etree.SubElement(parentElem, "blockquote") - - self._processSection(blockquote, dequoted, inList) - self._processSection(parentElem, lines[i:], inList) - + self.rawHtmlBlocks.append((html, safe)) + placeholder = HTML_PLACEHOLDER % self.html_counter + self.html_counter += 1 + return placeholder + + def rest(self): + self.html_counter = 0 + self.rawHtmlBlocks = [] + +class InlineStash: + + def __init__(self): + """ Create a InlineStash. 
""" + self.prefix = INLINE_PLACEHOLDER_PREFIX + self.suffix = ETX + self._nodes = {} + self.phLength = 4 + len(self.prefix) + len(self.suffix) + self._placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})') + + def _genPlaceholder(self, type): + """ Generate a placeholder """ + id = "%04d" % len(self._nodes) + hash = INLINE_PLACEHOLDER % id + return hash, id + + def extractId(self, data, index): + """ + Extract id from data string, start from index + + Keyword arguments: + + * data: string + * index: index, from which we start search + + Returns: placeholder id and string index, after + found placeholder + """ + m = self._placeholder_re.search(data, index) + if m: + return m.group(1), m.end() + else: + return None, index + 1 + + def isin(self, id): + """ Check if node with given id exists in stash """ + return self._nodes.has_key(id) + + def get(self, id): + """ Return node by id """ + return self._nodes.get(id) + + def add(self, node, type): + """ Add node to stash """ + pholder, id = self._genPlaceholder(type) + self._nodes[id] = node + return pholder + + def rest(self): + """ Reset instance """ + self._nodes = {} + + +class Markdown: + """Convert Markdown to HTML.""" - def _processCodeBlock(self, parentElem, lines, inList): + def __init__(self, + extensions=[], + extension_configs={}, + safe_mode = False): """ - Given a list of document lines starting with a code block - finds the end of the block, puts it into the ElementTree verbatim - wrapped in ("
      ") and recursively processes the
      -        the remainder of the text file.
      +        Creates a new Markdown instance.
       
               Keyword arguments:
               
      -        * parentElem: ElementTree element to which the content will be added
      -        * lines: a list of lines
      -        * inList: a level
      -        
      -        Returns: None
      +        * extensions: A list of extensions.  
      +           If they are of type string, the module mdx_name.py will be loaded.  
      +           If they are a subclass of markdown.Extension, they will be used 
      +           as-is.
      +        * extension_configs: Configuration settings for extensions.
      +        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
               
               """
      -        detabbed, theRest = self.detectTabbed(lines)
      +        self.parser = MarkdownParser()
      +        self.safeMode = safe_mode
      +        self.registeredExtensions = []
      +        self.docType = ""
      +        self.stripTopLevelTags = True
       
      -        pre = etree.SubElement(parentElem, "pre")
      -        code = etree.SubElement(pre, "code")
      -        
      -        text = "\n".join(detabbed).rstrip()+"\n"
      -        code.text = AtomicString(text)
      -        self._processSection(parentElem, theRest, inList)        
      +        self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
       
      -    def detectTabbed(self, lines):
      -        """ Find indented text and remove indent before further proccesing.
      +        self.preprocessors = [HEADER_PREPROCESSOR,
      +                              LINE_PREPROCESSOR,
      +                              # A footnote preprocessor will
      +                              # get inserted here
      +                              REFERENCE_PREPROCESSOR]
       
      -        Keyword arguments:
      +
      +        self.postprocessors = [PRETTIFYPOSTPROCESSOR,
      +                               # a footnote postprocessor will get
      +                               # inserted later
      +                               ]
      +
      +        self.textPostprocessors = [# a footnote postprocessor will get
      +                                   # inserted here
      +                                   RAWHTMLTEXTPOSTPROCESSOR,
      +                                   AMPSUBSTITUTETEXTPOSTPROCESSOR]
      +
      +        self.prePatterns = []
      +                               
      +        self.inlinePatterns = [
      +                               BACKTICK_PATTERN,
      +                               ESCAPE_PATTERN,
      +                               REFERENCE_PATTERN,
      +                               LINK_PATTERN,
      +                               IMAGE_LINK_PATTERN,
      +                               IMAGE_REFERENCE_PATTERN,
      +                               AUTOLINK_PATTERN,
      +                               AUTOMAIL_PATTERN,
      +                               LINE_BREAK_PATTERN_2,
      +                               LINE_BREAK_PATTERN,
      +                               HTML_PATTERN,
      +                               ENTITY_PATTERN,
      +                               NOT_STRONG_PATTERN,
      +                               STRONG_EM_PATTERN,
      +                               STRONG_PATTERN,
      +                               EMPHASIS_PATTERN,
      +                               EMPHASIS_PATTERN_2
      +                               # The order of the handlers matters!!!
      +                               ]
               
      -        * lines: an array of strings
      -        * fn: a function that returns a substring of a string
      -           if the string matches the necessary criteria
      +        self.inlineStash = InlineStash()
      +        self.references = {}
      +        self.htmlStash = HtmlStash()
      +
      +
      +        self.registerExtensions(extensions = extensions,
      +                                configs = extension_configs)
      +
      +        self.reset()
      +
      +
      +    def registerExtensions(self, extensions, configs):
      +        """ 
      +        Register extensions with this instance of Markdown.
      +
      +        Keyword arguments:
               
      -        Returns: a list of post processes items and the unused
      -        remainder of the original list
      +        * extensions: A list of extensions, which can either
      +           be strings or objects.  See the docstring on Markdown.
      +        * configs: A dictionary mapping module names to config options. 
               
               """
      -        items = []
      -        item = -1
      -        i = 0 # to keep track of where we are
      +        for ext in extensions:
      +            if isinstance(ext, basestring):
      +                ext = load_extension(ext, configs.get(ext, []))
      +            elif hasattr(ext, 'extendMarkdown'):
      +                # Looks like an Extension.
      +                # Nothing to do here.
      +                pass
      +            else:
      +                message(ERROR, "Incorrect type! Extension '%s' is "
      +                               "neither a string or an Extension." %(repr(ext)))
      +                continue
      +            ext.extendMarkdown(self, globals())
       
      -        def detab(line):
      -            match = CORE_RE['tabbed'].match(line)
      -            if match:
      -               return match.group(4)
      +    def registerExtension(self, extension):
      +        """ This gets called by the extension """
      +        self.registeredExtensions.append(extension)
       
      -        for line in lines:
      -            if line.strip(): # Non-blank line
      -                line = detab(line)
      -                if line:
      -                    items.append(line)
      -                    i += 1
      -                    continue
      -                else:
      -                    return items, lines[i:]
      +    def reset(self):
      +        """
      +        Resets all state variables so that we can start with a new text.
      +        """
      +        self.inlineStash.rest()
      +        self.htmlStash.rest()
      +        self.references.clear()
       
      -            else: # Blank line: _maybe_ we are done.
      -                i += 1 # advance
      +        HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash
      +        LINE_PREPROCESSOR.stash = self.htmlStash
      +        REFERENCE_PREPROCESSOR.references = self.references
      +        HTML_PATTERN.stash = self.htmlStash
      +        ENTITY_PATTERN.stash = self.htmlStash
      +        REFERENCE_PATTERN.references = self.references
      +        IMAGE_REFERENCE_PATTERN.references = self.references
      +        RAWHTMLTEXTPOSTPROCESSOR.stash = self.htmlStash
      +        RAWHTMLTEXTPOSTPROCESSOR.safeMode = self.safeMode
       
      -                # Find the next non-blank line
      -                for j in range(i, len(lines)):  
      -                    if lines[j].strip():
      -                        next_line = lines[j]; break
      -                else:
      -                    break # There is no more text; we are done.
      +        for extension in self.registeredExtensions:
      +            extension.reset()
       
      -                # Check if the next non-blank line is tabbed
      -                if detab(next_line): # Yes, more work to do.
      -                    items.append("")
      -                    continue
      -                else:
      -                    break # No, we are done.
      -        else:
      -            i += 1
      +        for pattern in self.inlinePatterns:
      +            pattern.safe_mode = self.safeMode
       
      -        return items, lines[i:]
      -        
           def _handleInline(self, data, patternIndex=0):
               """
               Process string with inline patterns and replace it
      @@ -1563,50 +1615,8 @@ class Markdown:
                   if not matched:
                       patternIndex += 1
               return data
      -    
      -    def _applyInline(self, pattern, data, patternIndex, startIndex=0):
      -        """ 
      -        Check if the line fits the pattern, create the necessary 
      -        elements, add it to InlineStash
      -        
      -        Keyword arguments:
      -        
      -        * data: the text to be processed
      -        * pattern: the pattern to be checked
      -        * patternIndex: index of current pattern
      -        * startIndex: string index, from which we starting search
      -
      -        Returns: String with placeholders instead of ElementTree elements.
      -        """
      -        match = pattern.getCompiledRegExp().match(data[startIndex:])
      -        leftData = data[:startIndex]
      - 
      -        if not match:
      -            return data, False, 0
       
      -        node = pattern.handleMatch(match)
      -     
      -        if node is None:
      -            return data, True, len(leftData) + match.span(len(match.groups()))[0]
      -        
      -        if not isString(node):         
      -            if not isinstance(node.text, AtomicString):
      -                # We need to process current node too
      -                for child in [node] + node.getchildren():
      -                    if not isString(node):
      -                        if child.text:
      -                            child.text = self._handleInline(child.text, 
      -                                                            patternIndex + 1)
      -                        if child.tail:
      -                            child.tail = self._handleInline(child.tail, 
      -                                                            patternIndex)
      -   
      -        pholder = self.inlineStash.add(node, pattern.type())
       
      -        return "%s%s%s%s" % (leftData, 
      -                             match.group(1), 
      -                             pholder, match.groups()[-1]), True, 0
      -   
           def _processElementText(self, node, subnode, isText=True):
               """
               Process placeholders in Element.text or Element.tail
      @@ -1706,6 +1716,51 @@ class Markdown:
                       data = ""
       
               return result
      +
      +    
      +    def _applyInline(self, pattern, data, patternIndex, startIndex=0):
      +        """ 
      +        Check if the line fits the pattern, create the necessary 
      +        elements, add it to InlineStash
      +        
      +        Keyword arguments:
      +        
      +        * data: the text to be processed
      +        * pattern: the pattern to be checked
      +        * patternIndex: index of current pattern
      +        * startIndex: string index, from which we starting search
      +
      +        Returns: String with placeholders instead of ElementTree elements.
      +        """
      +        match = pattern.getCompiledRegExp().match(data[startIndex:])
      +        leftData = data[:startIndex]
      + 
      +        if not match:
      +            return data, False, 0
      +
      +        node = pattern.handleMatch(match)
      +     
      +        if node is None:
      +            return data, True, len(leftData) + match.span(len(match.groups()))[0]
      +        
      +        if not isString(node):         
      +            if not isinstance(node.text, AtomicString):
      +                # We need to process current node too
      +                for child in [node] + node.getchildren():
      +                    if not isString(node):
      +                        if child.text:
      +                            child.text = self._handleInline(child.text, 
      +                                                            patternIndex + 1)
      +                        if child.tail:
      +                            child.tail = self._handleInline(child.tail, 
      +                                                            patternIndex)
      +   
      +        pholder = self.inlineStash.add(node, pattern.type())
      +
      +        return "%s%s%s%s" % (leftData, 
      +                             match.group(1), 
      +                             pholder, match.groups()[-1]), True, 0
      +
           
           def applyInlinePatterns(self, markdownTree):
               """
      @@ -1756,66 +1811,36 @@ class Markdown:
                      
               return markdownTree
       
      -    def markdownToTree(self, source=None):
      -        """Create ElementTree, without applying inline paterns.
      -        
      -        Keyword arguments:
      -        
      -        * source: An ascii or unicode string of Markdown formated text.
      +    def convert (self, source):
      +        """Convert markdown to serialized XHTML."""
       
      -        Returns: ElementTree object.
      -        """
      +        # Fixup the source text
      +        if not source:
      +            return u""  # a blank unicode string
               try:
      -            self.source = unicode(self.source)
      +            source = unicode(source)
               except UnicodeDecodeError:
      -            message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii  input.')
      +            message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
                   return u""
      -        
      -        # Fixup the source text
      -        self.source = self.source.replace(STX, "")
      -        self.source = self.source.replace(ETX, "")
      -        self.source = self.source.replace("\r\n", "\n").replace("\r", "\n")
      -        self.source += "\n\n"
      -        self.source = self.source.expandtabs(TAB_LENGTH)
       
      +        source = source.replace(STX, "")
      +        source = source.replace(ETX, "")
      +        source = source.replace("\r\n", "\n").replace("\r", "\n")
      +        source += "\n\n"
      +        source = source.expandtabs(TAB_LENGTH)
      +
      +        # Run the text preprocessors
               for pp in self.textPreprocessors:
      -            self.source = pp.run(self.source)
      +            source = pp.run(source)
       
      -        # Split into lines and run the preprocessors that will work with 
      -        # self.lines
      -        self.lines = self.source.split("\n")
      +        # Split into lines and run the line preprocessors.
      +        self.lines = source.split("\n")
               for prep in self.preprocessors :
                   self.lines = prep.run(self.lines)
       
      -        # Create a ElementTree from the lines
      -        self.root = etree.Element("div")
      -        buffer = []
      -        for line in self.lines:
      -            if line.startswith("#"):
      -                self._processSection(self.root, buffer)
      -                buffer = [line]
      -            else:
      -                buffer.append(line)
      -
      -        self._processSection(self.root, buffer)
      -    
      -        return etree.ElementTree(self.root)
      -
      -
      -    def convert (self, source):
      -        """Convert markdown to serialized XHTML.
      -
      -        Keyword arguments:
      -        
      -        * source: An ascii or unicode string of Markdown formated text.
      -
      -        """
      -        self.source = source
      -        if not self.source:
      -            return u""  # a blank unicode string
      +        # Parse the high-level elements.
      +        tree = self.parser.parseDocument(self.lines)
       
      -        # Build a tree from the Markdown source and get its root.
      -        tree = self.markdownToTree(source)
               root = self.applyInlinePatterns(tree).getroot()
       
               # Run the post-processors
      @@ -1836,98 +1861,47 @@ class Markdown:
       
               return xml.strip()
       
      -    def __str__(self):
      -        """ Report info about instance. Markdown always returns unicode."""
      -        if self.source is None:
      -            status = 'in which no source text has been assinged.'
      -        else:
      -            status = 'which contains %d chars and %d line(s) of source.'%\
      -                     (len(self.source), self.source.count('\n')+1)
      -        return 'An instance of "%s" %s'% (self.__class__, status)
      -
      -    __unicode__ = convert # markdown should always return a unicode string
      -
      -
      -"""
      -EXPORTED FUNCTIONS
      -=============================================================================
      -
      -Those are the two functions we really mean to export: markdown() and
      -markdownFromFile().
      -"""
      -
      -def markdownFromFile(input = None,
      -                     output = None,
      -                     extensions = [],
      -                     encoding = None,
      -                     safe = False):
      -    """Converts a markdown file and returns the HTML as a unicode string.
      -
      -    Used from the command-line, although may be useful in other situations. 
      -    Decodes the file using the provided encoding (defaults to utf-8), passes 
      -    the file content to markdown, and outputs the html to either the provided
      -    filename or stdout in the same encoding as the source file.
      -
      -    **Note:** This is the only place that decoding and encoding of unicode
      -    takes place in Python-Markdown.  (All other code is unicode-in /
      -    unicode-out.)
      -
      -    Keyword arguments:
      -
      -    * input: Name of source text file.
      -    * output: Name of output file. Writes to stdout if `None`.
      -    * extensions: A list of extension names (may contain config args).  
      -    * encoding: Encoding of input and output files. Defaults to utf-8.
      -    * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
      -
      -    """
      -    
      -    encoding = encoding or "utf-8"
      -
      -    # Read the source
      -    input_file = codecs.open(input, mode="r", encoding=encoding)
      -    text = input_file.read()
      -    input_file.close()
      -    text = text.lstrip(u'\ufeff') # remove the byte-order mark
      -
      -    # Convert
      -    html = markdown(text, extensions, safe_mode = safe)
      -
      -    # Write to file or stdout
      -    if output:
      -        output_file = codecs.open(output, "w", encoding=encoding)
      -        output_file.write(html)
      -        output_file.close()
      -    else:
      -        sys.stdout.write(html.encode(encoding))
      -
      -def markdown(text,
      -             extensions = [],
      -             safe_mode = False):
      -    """
      -    Convenience wrapper function for `Markdown` class.
      -
      -    Useful in a typical use case. Initializes an instance of the `Markdown` 
      -    class, loads any extensions and runs the parser on the given text. 
      -
      -    Keyword arguments:
      -
      -    * text: An ascii or Unicode string of Markdown formatted text.
      -    * extensions: A list of extension names (may contain config args).  
      -    * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
      +    def convertFile(self, input = None, output = None, encoding = None):
      +        """Converts a markdown file and writes the HTML to a file or stream.
       
      -    Returns: An HTML document as a string.
      +        Decodes the file using the provided encoding (defaults to utf-8),
      +        passes the file content to markdown, and outputs the html to either
      +        the provided stream or the file with provided name, using the same
      +        encoding as the source file.
       
      -    """
      -    message(DEBUG, "in markdown.markdown(), received text:\n%s" % text)
      +        **Note:** This is the only place that decoding and encoding of unicode
      +        takes place in Python-Markdown.  (All other code is unicode-in /
      +        unicode-out.)
       
      -    extensions = [load_extension(e) for e in extensions]
      +        Keyword arguments:
       
      -    md = Markdown(extensions=extensions,
      -                  safe_mode = safe_mode)
      +        * input: Name of source text file.
      +        * output: Name of output file, or a writable stream object.
      +        * encoding: Encoding of input and output files. Defaults to utf-8.
      +
      +        Extensions and safe mode are those configured on this instance.
       
      -    return md.convert(text)
      +        """
               
      +        encoding = encoding or "utf-8"
      +
      +        # Read the source
      +        input_file = codecs.open(input, mode="r", encoding=encoding)
      +        text = input_file.read()
      +        input_file.close()
      +        text = text.lstrip(u'\ufeff') # remove the byte-order mark
      +
      +        # Convert
      +        html = self.convert(text)
      +
      +        # Write to a named file, or to the given stream
      +        if isinstance(output, basestring):
      +            output_file = codecs.open(output, "w", encoding=encoding)
      +            output_file.write(html)
      +            output_file.close()
      +        else:
      +            output.write(html.encode(encoding))
      +
       
       """
       Extensions
      @@ -1966,65 +1940,113 @@ class Extension:
               
               This method must be overriden by every extension.
       
      -        Ketword arguments:
      +        Keyword arguments:
       
               * md: The Markdown instance.
       
      -        * md_globals: All global variables availabel in the markdown module
      -        namespace.
      +        * md_globals: Global variables in the markdown module namespace.
       
               """
               pass
       
       
       def load_extension(ext_name, configs = []):
      -    """ 
      -    Load extension by name, then return the module.
      +    """Load extension by name; return its instance, or None on failure.
           
           The extension name may contain arguments as part of the string in the 
      -    following format:
      -
      -        "extname(key1=value1,key2=value2)"
      -    
      -    Print an error message and exit on failure. 
      +    following format: "extname(key1=value1,key2=value2)"
           
           """
       
      -    # I am making the assumption that the order of config options
      -    # does not matter.
      +    # Parse extensions config params (ignore the order)
           configs = dict(configs)
      -    pos = ext_name.find("(") 
      +    pos = ext_name.find("(") # find the first "("
           if pos > 0:
               ext_args = ext_name[pos+1:-1]
               ext_name = ext_name[:pos]
               pairs = [x.split("=") for x in ext_args.split(",")]
               configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
       
      +    # Setup the module names
           ext_module = 'markdown_extensions'
      -    module_name = '.'.join([ext_module, ext_name])
      -    extension_module_name = '_'.join(['mdx', ext_name])
      +    module_name_new_style = '.'.join([ext_module, ext_name])
      +    module_name_old_style = '_'.join(['mdx', ext_name])
       
      -    try:
      -            module = __import__(module_name, {}, {}, [ext_module])
      +    # Try loading the extension first from one place, then another
      +    try: # New style (markdown_extensions.<name>)
      +        module = __import__(module_name_new_style, {}, {}, [ext_module])
           except ImportError:
      +        try: # Old style (mdx_<name>)
      +            module = __import__(module_name_old_style)
      +        except ImportError:
      +            module = None # bind the name so the check below can warn
      +
      +    if module :
      +        # If the module is loaded successfully, we expect it to define a
      +        # function called makeExtension()
               try:
      -            module = __import__(extension_module_name)
      +            return module.makeExtension(configs.items())
               except:
      -            message(WARN,
      -                "Failed loading extension '%s' from '%s' or '%s' "
      -                "- continuing without."
      -                % (ext_name, module_name, extension_module_name) )
      -            # Return a dummy (do nothing) Extension as silent failure
      -            return Extension(configs={})
      -
      -    return module.makeExtension(configs.items())    
      +            message(WARN, "Failed to instantiate extension '%s'" % ext_name)
      +    else:
      +        message(WARN, "Failed loading extension '%s' from '%s' or '%s'"
      +                % (ext_name, module_name_new_style, module_name_old_style))
       
      +def load_extensions(ext_names):
      +    """Load the named extensions and return the ones that loaded.
      +
      +    Names for which load_extension() yields nothing are left out.
      +    """
      +    loaded = [load_extension(ext_name) for ext_name in ext_names]
      +    return [extension for extension in loaded if extension]
       
       # Extensions should use "markdown.etree" instead of "etree" (or do `from
       # markdown import etree`).  Do not import it by yourself.
       
       etree = importETree() 
       
      +"""
      +EXPORTED FUNCTIONS
      +=============================================================================
      +
      +These are the two functions we really mean to export: markdown() and
      +markdownFromFile().
      +"""
      +
      +def markdown(text, extensions = [], safe_mode = False):
      +    """Render markdown source text to HTML in a single call.
      +
      +    Convenience wrapper for the most basic use case: the requested
      +    extensions are loaded, a Markdown instance is configured with them
      +    and with the given safe mode, and `text` is run through it.
      +
      +    Keyword arguments:
      +
      +    * text: Markdown formatted text as Unicode or ASCII string.
      +    * extensions: Extension names, passed on to load_extensions(); a
      +      name may contain config args.
      +    * safe_mode: Disallow raw html.  One of "remove", "replace" or
      +      "escape".  Handed to Markdown() unchanged.
      +
      +    Returns: The HTML document produced by Markdown.convert().
      +
      +    """
      +    loaded_extensions = load_extensions(extensions)
      +    converter = Markdown(extensions=loaded_extensions, safe_mode=safe_mode)
      +    return converter.convert(text)
      +
      +
      +def markdownFromFile(input = None,
      +                     output = None,
      +                     extensions = [],
      +                     encoding = None,
      +                     safe = False):
      +    """Convert the markdown file `input` and write the HTML to `output`."""
      +    # `safe` is this function's name for Markdown's `safe_mode` option.
      +    md = Markdown(extensions=load_extensions(extensions),
      +                  safe_mode = safe)
      +    md.convertFile(input, output, encoding)
      +
       
       """
       COMMAND-LINE SPECIFIC STUFF
      -- 
      cgit v1.2.3