Replaced old core parser with new BlockParser and copied old core into extension.

author: Waylan Limberg <waylan@gmail.com> 2008-11-11 19:18:41 -0500
committer: Waylan Limberg <waylan@gmail.com> 2008-11-13 23:19:45 -0500
commit: ba147ca9b2eae544e802c8216936065d2d86a8d8 (patch)
tree: 5395a029c1a054de8f20cfadacdbbd0de10754f9
parent: e968bbf38cc6570da2ccc3c7e87d99a36758544c (diff)
download: markdown-ba147ca9b2eae544e802c8216936065d2d86a8d8.tar.gz
markdown-ba147ca9b2eae544e802c8216936065d2d86a8d8.tar.bz2
markdown-ba147ca9b2eae544e802c8216936065d2d86a8d8.zip
2 files changed, 709 insertions, 401 deletions
diff --git a/markdown.py b/markdown.py
index b534151..b40093f 100755
--- a/markdown.py
+++ b/markdown.py
@@ -98,24 +98,6 @@ INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
 INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
 AMP_SUBSTITUTE = STX+"amp"+ETX
 
-def wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL)
-CORE_RE = {
-    'header':          wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title
-    'reference-def':   wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'),
-                               # [Google]: http://www.google.com/
-    'containsline':    wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc.
-    'ol':              wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text
-    'ul':              wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text"
-    'isline1':         wrapRe(r'(\**)'), # ***
-    'isline2':         wrapRe(r'(\-*)'), # ---
-    'isline3':         wrapRe(r'(\_*)'), # ___
-    'tabbed':          wrapRe(r'((\t)|(    ))(.*)'), # an indented line
-    'quoted':          wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...")
-    'containsline':    re.compile(r'^([-]*)$|^([=]*)$', re.M),
-    'attr':            re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
-}
-"""Basic and reusable regular expressions."""
-
 
 """
 AUXILIARY GLOBAL FUNCTIONS
@@ -163,11 +145,13 @@ def isBlockLevel(tag):
     """Check if the tag is a block level HTML tag."""
     return BLOCK_LEVEL_ELEMENTS.match(tag)
 
+ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
+
 def handleAttributes(text, parent):
     """Set values of an element based on attribute definitions ({@id=123})."""
     def attributeCallback(match):
         parent.set(match.group(1), match.group(2))
-    return CORE_RE['attr'].sub(attributeCallback, text)
+    return ATTR_RE.sub(attributeCallback, text)
 
 def dequote(string):
     """Remove quotes from around a string."""
@@ -211,369 +195,286 @@ inline elements such as **bold** or *italics*, but rather just catches blocks,
 lists, quotes, etc.
 """
 
-class MarkdownParser:
-    """Parser Markdown into a ElementTree."""
+class BlockProcessor:
+    """ Base class for block processors. """
+    def __init__(self, parser=None):
+        self.parser = parser
 
-    def __init__(self):
-        pass
+    def lastChild(self, parent):
+        """ Return the last child of an etree element. """
+        if len(parent):
+            return parent[-1]
+        else:
+            return None
 
-    def parseDocument(self, lines):
-        """Parse a markdown string into an ElementTree."""
-        # Create a ElementTree from the lines
-        root = etree.Element("div")
-        buffer = []
+    def detab(self, text):
+        """ Remove a tab from the front of each line of the given text. """
+        newtext = []
+        lines = text.split('\n')
         for line in lines:
-            if line.startswith("#"):
-                self.parseChunk(root, buffer)
-                buffer = [line]
+            if line.startswith(' '*4):
+                newtext.append(line[4:])
+            elif not line.strip():
+                newtext.append('')
             else:
-                buffer.append(line)
-
-        self.parseChunk(root, buffer)
-
-        return etree.ElementTree(root)
+                break
+        return '\n'.join(newtext), '\n'.join(lines[len(newtext):])
 
-    def parseChunk(self, parent_elem, lines, inList=0, looseList=0):
-        """Process a chunk of markdown-formatted text and attach the parse to
-        an ElementTree node.
+    def looseDetab(self, text):
+        """ Remove a tab from the front of lines, but allow dedented lines. """
+        lines = text.split('\n')
+        for i in range(len(lines)):
+            if lines[i].startswith(' '*4):
+                lines[i] = lines[i][4:]
+        return '\n'.join(lines)
 
-        Process a section of a source document, looking for high
-        level structural elements like lists, block quotes, code
-        segments, html blocks, etc.  Some those then get stripped
-        of their high level markup (e.g. get unindented) and the
-        lower-level markup is processed recursively.
+    def test(self, parent, block):
+        """ Return boolean. Must be overridden by subclasses. """
+        pass
 
-        Keyword arguments:
+    def run(self, parent, blocks):
+        """ Run processor. Must be overridden by subclasses. """
 
-        * parent_elem: The ElementTree element to which the content will be
-                       added.
-        * lines: a list of lines
-        * inList: a level
 
-        Returns: None
+class ListIndentProcessor(BlockProcessor):
+    """ Process children of list items. """
 
-        """
-        # Loop through lines until none left.
-        while lines:
-            # Skipping empty line
-            if not lines[0]:
-                lines = lines[1:]
-                continue
+    def test(self, parent, block):
+        return block.startswith(' '*4) and parent[-1] and \
+                (parent[-1].tag == "ul" or parent[-1].tag == "ol")
 
-            # Check if this section starts with a list, a blockquote or
-            # a code block.  If so, process them.
-            processFn = { 'ul':     self.__processUList,
-                          'ol':     self.__processOList,
-                          'quoted': self.__processQuote,
-                          'tabbed': self.__processCodeBlock}
-            for regexp in ['ul', 'ol', 'quoted', 'tabbed']:
-                m = CORE_RE[regexp].match(lines[0])
-                if m:
-                    processFn[regexp](parent_elem, lines, inList)
-                    return
-
-            # We are NOT looking at one of the high-level structures like
-            # lists or blockquotes.  So, it's just a regular paragraph
-            # (though perhaps nested inside a list or something else).  If
-            # we are NOT inside a list, we just need to look for a blank
-            # line to find the end of the block.  If we ARE inside a
-            # list, however, we need to consider that a sublist does not
-            # need to be separated by a blank line.  Rather, the following
-            # markup is legal:
-            #
-            # * The top level list item
-            #
-            #     Another paragraph of the list.  This is where we are now.
-            #     * Underneath we might have a sublist.
-            #
-
-            if inList:
-                start, lines  = self.__linesUntil(lines, (lambda line:
-                                 CORE_RE['ul'].match(line)
-                                 or CORE_RE['ol'].match(line)
-                                                  or not line.strip()))
-                self.parseChunk(parent_elem, start, inList-1,
-                                looseList=looseList)
-                inList = inList-1
-
-            else: # Ok, so it's just a simple block
-                test = lambda line: not line.strip() or line[0] == '>'
-                paragraph, lines = self.__linesUntil(lines, test)
-                if len(paragraph) and paragraph[0].startswith('#'):
-                    self.__processHeader(parent_elem, paragraph)
-                elif len(paragraph) and CORE_RE["isline3"].match(paragraph[0]):
-                    self.__processHR(parent_elem)
-                    lines = paragraph[1:] + lines
-                elif paragraph:
-                    self.__processParagraph(parent_elem, paragraph,
-                                          inList, looseList)
-
-            if lines and not lines[0].strip():
-                lines = lines[1:]  # skip the first (blank) line
-
-    def __processHR(self, parentElem):
-        hr = etree.SubElement(parentElem, "hr")
-
-    def __processHeader(self, parentElem, paragraph):
-        m = CORE_RE['header'].match(paragraph[0])
-        if m:
-            level = len(m.group(1))
-            h = etree.SubElement(parentElem, "h%d" % level)
-            h.text = m.group(2).strip()
+    def run(self, parent, blocks):
+        block = blocks.pop(0)
+        sibling = self.lastChild(parent)
+        if len(sibling) and sibling[-1].tag == 'li':
+            self.parser.parseBlocks(sibling[-1], [self.looseDetab(block)])
         else:
-            message(CRITICAL, "We've got a problem header!")
+            li = etree.SubElement(sibling, 'li')
+            self.parser.parseBlocks(li, [self.looseDetab(block)])
 
-    def __processParagraph(self, parentElem, paragraph, inList, looseList):
 
-        if ( parentElem.tag == 'li'
-                and not (looseList or parentElem.getchildren())):
+class CodeBlockProcessor(BlockProcessor):
+    """ Process code blocks. """
 
-            # If this is the first paragraph inside "li", don't
-            # put <p> around it - append the paragraph bits directly
-            # onto parentElem
-            el = parentElem
+    def test(self, parent, block):
+        return block.startswith(' '*4)
+    
+    def run(self, parent, blocks):
+        sibling = self.lastChild(parent)
+        block = blocks.pop(0)
+        theRest = ''
+        if sibling and sibling.tag == "pre" and len(sibling) \
+                    and sibling[0].tag == "code":
+            code = sibling[0]
+            block, theRest = self.detab(block)
+            code.text = '%s\n%s\n' % (code.text, block.rstrip())
         else:
-            # Otherwise make a "p" element
-            el = etree.SubElement(parentElem, "p")
-
-        dump = []
-
-        # Searching for hr or header
-        for line in paragraph:
-            # it's hr
-            if CORE_RE["isline3"].match(line):
-                el.text = "\n".join(dump)
-                self.__processHR(el)
-                dump = []
-            # it's header
-            elif line.startswith("#"):
-                el.text = "\n".join(dump)
-                self.__processHeader(parentElem, [line])
-                dump = []
-            else:
-                dump.append(line)
-        if dump:
-            text = "\n".join(dump)
-            el.text = text
-
-    def __processUList(self, parentElem, lines, inList):
-        self.__processList(parentElem, lines, inList, listexpr='ul', tag='ul')
-
-    def __processOList(self, parentElem, lines, inList):
-        self.__processList(parentElem, lines, inList, listexpr='ol', tag='ol')
-
-    def __processList(self, parentElem, lines, inList, listexpr, tag):
-        """
-        Given a list of document lines starting with a list item,
-        finds the end of the list, breaks it up, and recursively
-        processes each list item and the remainder of the text file.
-
-        Keyword arguments:
-
-        * parentElem: A ElementTree element to which the content will be added
-        * lines: a list of lines
-        * inList: a level
+            pre = etree.SubElement(parent, 'pre')
+            code = etree.SubElement(pre, 'code')
+            block, theRest = self.detab(block)
+            code.text = '%s\n' % block.rstrip()
+        if theRest:
+            blocks.insert(0, theRest)
 
-        Returns: None
 
-        """
-        ul = etree.SubElement(parentElem, tag) # ul might actually be '<ol>'
-
-        looseList = 0
-
-        # Make a list of list items
-        items = []
-        item = -1
+class BlockQuoteProcessor(BlockProcessor):
 
-        i = 0  # a counter to keep track of where we are
-        for line in lines:
-            loose = 0
-            if not line.strip():
-                # If we see a blank line, this _might_ be the end of the list
-                i += 1
-                loose = 1
-
-                # Find the next non-blank line
-                for j in range(i, len(lines)):
-                    if lines[j].strip():
-                        next = lines[j]
-                        break
-                else:
-                    # There is no more text => end of the list
-                    break
+    RE = re.compile(r'^[ ]{0,3}>[ ](.*)')
 
-                # Check if the next non-blank line is still a part of the list
+    def test(self, parent, block):
+        return bool(self.RE.match(block))
 
-                if ( CORE_RE[listexpr].match(next) or
-                     CORE_RE['tabbed'].match(next) ):
-                    # get rid of any white space in the line
-                    items[item].append(line.strip())
-                    looseList = loose or looseList
-                    continue
-                else:
-                    break # found end of the list
-
-            # Now we need to detect list items (at the current level)
-            # while also detabing child elements if necessary
-
-            for expr in ['ul', 'ol', 'tabbed']:
-                m = CORE_RE[expr].match(line)
-                if m:
-                    if expr in ['ul', 'ol']:  # We are looking at a new item
-                        #if m.group(1) :
-                        # Removed the check to allow for a blank line
-                        # at the beginning of the list item
-                        items.append([m.group(1)])
-                        item += 1
-                    elif expr == 'tabbed':  # This line needs to be detabbed
-                        items[item].append(m.group(4)) #after the 'tab'
-                    i += 1
-                    break
-            else:
-                items[item].append(line)  # Just regular continuation
-                i += 1 # added on 2006.02.25
+    def run(self, parent, blocks):
+        block = '\n'.join([self.clean(line) for line in 
+                            blocks.pop(0).split('\n')])
+        sibling = self.lastChild(parent)
+        if sibling and sibling.tag == "blockquote":
+            quote = sibling
         else:
-            i += 1
-
-        # Add the ElementTree elements
-        for item in items:
-            li = etree.SubElement(ul, "li")
-            self.parseChunk(li, item, inList + 1, looseList = looseList)
+            quote = etree.SubElement(parent, 'blockquote')
+        self.parser.parseBlocks(quote, [block])
 
-        # Process the remaining part of the section
-        self.parseChunk(parentElem, lines[i:], inList)
-
-    def __linesUntil(self, lines, condition):
-        """
-        A utility function to break a list of lines upon the
-        first line that satisfied a condition.  The condition
-        argument should be a predicate function.
-
-        """
-        i = -1
-        for line in lines:
-            i += 1
-            if condition(line):
-                break
+    def clean(self, line):
+        """ Remove ``>`` from the beginning of a line. """
+        m = self.RE.match(line)
+        if m:
+            return m.group(1)
+        elif line.strip() == ">":
+            return ""
+        else:
+            return line
+
+class OListProcessor(BlockProcessor):
+    """ Process ordered list blocks. """
+
+    TAG = 'ol'
+    RE = re.compile(r'^[ ]{0,3}\d+\.[ ](.*)')
+
+    def test(self, parent, block):
+        return bool(self.RE.match(block))
+
+    def run(self, parent, blocks):
+        items = self.get_items(blocks.pop(0))
+        sibling = self.lastChild(parent)
+        if sibling and sibling.tag == self.TAG:
+            lst = sibling
+            # make sure previous item is in a p. 
+            if len(lst) and lst[-1].text and not len(lst[-1]):
+                p = etree.SubElement(lst[-1], 'p')
+                p.text = lst[-1].text
+                lst[-1].text = ''
+            # parse first block differently as it gets wrapped in a p.
+            li = etree.SubElement(lst, 'li')
+            self.parser.state = 'looselist'
+            firstitem = items.pop(0)
+            self.parser.parseBlocks(li, [firstitem])
+            self.parser.resetState()
         else:
-            i += 1
-        return lines[:i], lines[i:]
+            lst = etree.SubElement(parent, self.TAG)
+        self.parser.state = 'list'
+        for item in items:
+            li = etree.SubElement(lst, 'li')
+            self.parser.parseBlocks(li, [item])
+        self.parser.resetState()
 
-    def __processQuote(self, parentElem, lines, inList):
-        """
-        Given a list of document lines starting with a quote finds
-        the end of the quote, unindents it and recursively
-        processes the body of the quote and the remainder of the
-        text file.
+    def get_items(self, block):
+        """ Break a block into list items. """
+        items = []
+        for line in block.split('\n'):
+            m = self.RE.match(line)
+            if m:
+                items.append(m.group(1))
+            else:
+                items[-1] = '\n'.join([items[-1], line])
+        return items
 
-        Keyword arguments:
 
-        * parentElem: ElementTree element to which the content will be added
-        * lines: a list of lines
-        * inList: a level
+class UListProcessor(OListProcessor):
+    """ Process unordered list blocks. """
 
-        Returns: None
+    TAG = 'ul'
+    RE = re.compile(r'^[ ]{0,3}[*+-][ ](.*)')
 
-        """
-        dequoted = []
-        i = 0
-        blank_line = False # allow one blank line between paragraphs
-        for line in lines:
-            m = CORE_RE['quoted'].match(line)
-            if m:
-                dequoted.append(m.group(1))
-                i += 1
-                blank_line = False
-            elif not blank_line and line.strip() != '':
-                dequoted.append(line)
-                i += 1
-            elif not blank_line and line.strip() == '':
-                dequoted.append(line)
-                i += 1
-                blank_line = True
-            else:
-                break
 
-        blockquote = etree.SubElement(parentElem, "blockquote")
+class HashHeaderProcessor(BlockProcessor):
+    """ Process Hash Headers. """
 
-        self.parseChunk(blockquote, dequoted, inList)
-        self.parseChunk(parentElem, lines[i:], inList)
+    RE = re.compile(r'^(#{1,6})(.*?)#*$')
 
-    def __processCodeBlock(self, parentElem, lines, inList):
-        """
-        Given a list of document lines starting with a code block
-        finds the end of the block, puts it into the ElementTree verbatim
-        wrapped in ("<pre><code>") and recursively processes the
-        the remainder of the text file.
+    def test(self, parent, block):
+        return block.startswith('#')
 
-        Keyword arguments:
+    def run(self, parent, blocks):
+        lines = blocks.pop(0).split('\n')
+        line1 = lines.pop(0)
+        m = self.RE.match(line1)
+        if m:
+            h = etree.SubElement(parent, 'h%d' % len(m.group(1)))
+            h.text = m.group(2).strip()
+        else:
+            lines.insert(0, line1)
+        if len(lines):
+            blocks.insert(0, '\n'.join(lines))
 
-        * parentElem: ElementTree element to which the content will be added
-        * lines: a list of lines
-        * inList: a level
 
-        Returns: None
+class SHeaderProcessor(BlockProcessor):
+    """ Process Setext-style Headers. """
 
-        """
-        detabbed, theRest = self.detectTabbed(lines)
-        pre = etree.SubElement(parentElem, "pre")
-        code = etree.SubElement(pre, "code")
-        text = "\n".join(detabbed).rstrip()+"\n"
-        code.text = AtomicString(text)
-        self.parseChunk(parentElem, theRest, inList)
+    RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)
 
-    def detectTabbed(self, lines):
-        """ Find indented text and remove indent before further proccesing.
+    def test(self, parent, block):
+        return bool(self.RE.match(block))
 
-        Keyword arguments:
+    def run(self, parent, blocks):
+        lines = blocks.pop(0).split('\n')
+        if lines[1].startswith('='):
+            level = 1
+        else:
+            level = 2
+        h = etree.SubElement(parent, 'h%d' % level)
+        h.text = lines[0].strip()
+        if len(lines) > 2:
+            blocks.insert(0, '\n'.join(lines[2:]))
 
-        * lines: an array of strings
-        * fn: a function that returns a substring of a string
-           if the string matches the necessary criteria
 
-        Returns: a list of post processes items and the unused
-        remainder of the original list
+class HRProcessor(BlockProcessor):
+    """ Process Horizontal Rules. """
 
-        """
-        items = []
-        item = -1
-        i = 0 # to keep track of where we are
+    RE = re.compile(r'([*_-][ ]?){3,}')
 
-        def detab(line):
-            match = CORE_RE['tabbed'].match(line)
-            if match:
-               return match.group(4)
+    def test(self, parent, block):
+        return bool(self.RE.search(block))
 
+    def run(self, parent, blocks):
+        # Check for lines in block before hr.
+        lines = blocks.pop(0).split('\n')
+        prelines = []
         for line in lines:
-            if line.strip(): # Non-blank line
-                line = detab(line)
-                if line:
-                    items.append(line)
-                    i += 1
-                    continue
-                else:
-                    return items, lines[i:]
+            m = self.RE.match(line)
+            if m:
+                break
+            else:
+                prelines.append(line)
+        if len(prelines):
+            self.parser.parseBlocks(parent, ['\n'.join(prelines)])
+        # create hr
+        hr = etree.SubElement(parent, 'hr')
+        # check for lines in block after hr.
+        lines = lines[len(prelines)+1:]
+        if len(lines):
+            blocks.insert(0, '\n'.join(lines))
+
+
+class PBlockProcessor(BlockProcessor):
+    """ Process Paragraph blocks. """
+
+    def test(self, parent, block):
+        return True
+
+    def run(self, parent, blocks):
+        block = blocks.pop(0)
+        if block.strip():
+            if self.parser.state == 'list':
+                parent.text = block
+            else:
+                p = etree.SubElement(parent, 'p')
+                p.text = block
 
-            else: # Blank line: _maybe_ we are done.
-                i += 1 # advance
 
-                # Find the next non-blank line
-                for j in range(i, len(lines)):
-                    if lines[j].strip():
-                        next_line = lines[j]; break
-                else:
-                    break # There is no more text; we are done.
+class BlockParser:
+    """ Parse Markdown blocks into an ElementTree object. """
 
-                # Check if the next non-blank line is tabbed
-                if detab(next_line): # Yes, more work to do.
-                    items.append("")
-                    continue
-                else:
-                    break # No, we are done.
-        else:
-            i += 1
+    def __init__(self):
+        self.blockprocessors = OrderedDict()
+        self.blockprocessors['indent'] = ListIndentProcessor(self)
+        self.blockprocessors['code'] = CodeBlockProcessor(self)
+        self.blockprocessors['hashheader'] = HashHeaderProcessor(self)
+        self.blockprocessors['sheader'] = SHeaderProcessor(self)
+        self.blockprocessors['hr'] = HRProcessor(self)
+        self.blockprocessors['olist'] = OListProcessor(self)
+        self.blockprocessors['ulist'] = UListProcessor(self)
+        self.blockprocessors['quote'] = BlockQuoteProcessor(self)
+        self.blockprocessors['paragraph'] = PBlockProcessor(self)
+        self.resetState()
+
+    def resetState(self):
+        self.state = ''
 
-        return items, lines[i:]
+    def parseDocument(self, lines):
+        """ Parse a markdown string into an ElementTree. """
+        # Create a ElementTree from the lines
+        root = etree.Element("div")
+        blocks = '\n'.join(lines).split('\n\n')
+        self.parseBlocks(root, blocks)
+        return etree.ElementTree(root)
+
+    def parseBlocks(self, parent, blocks):
+        """ Process blocks of markdown text and attach to given etree node. """
+        while blocks:
+           for processor in self.blockprocessors.values():
+               if processor.test(parent, blocks[0]):
+                   processor.run(parent, blocks)
+                   break
 
 
 """
@@ -725,75 +626,15 @@ class HtmlBlockPreprocessor(Preprocessor):
         return new_text.split("\n")
 
 
-class HeaderPreprocessor(Preprocessor):
-
-    """Replace underlined headers with hashed headers.
-
-    (To avoid the need for lookahead later.)
-
-    """
-
-    def run (self, lines):
-        i = -1
-        while i+1 < len(lines):
-            i = i+1
-            if not lines[i].strip():
-                continue
-
-            if lines[i].startswith("#"):
-                lines.insert(i+1, "\n")
-
-            if (i+1 <= len(lines)
-                  and lines[i+1]
-                  and lines[i+1][0] in ['-', '=']):
-
-                underline = lines[i+1].strip()
-
-                if underline == "="*len(underline):
-                    lines[i] = "# " + lines[i].strip()
-                    lines[i+1] = ""
-                elif underline == "-"*len(underline):
-                    lines[i] = "## " + lines[i].strip()
-                    lines[i+1] = ""
-
-        return lines
-
-
-class LinePreprocessor(Preprocessor):
-    """Convert HR lines to "___" format."""
-    blockquote_re = re.compile(r'^(> )+')
-
-    def run (self, lines):
-        for i in range(len(lines)):
-            prefix = ''
-            m = self.blockquote_re.search(lines[i])
-            if m:
-                prefix = m.group(0)
-            if self._isLine(lines[i][len(prefix):]):
-                lines[i] = prefix + "___"
-        return lines
-
-    def _isLine(self, block):
-        """Determine if a block should be replaced with an <HR>"""
-        if block.startswith("    "):
-            return False  # a code block
-        text = "".join([x for x in block if not x.isspace()])
-        if len(text) <= 2:
-            return False
-        for pattern in ['isline1', 'isline2', 'isline3']:
-            m = CORE_RE[pattern].match(text)
-            if (m and m.group(1)):
-                return True
-        else:
-            return False
+class ReferencePreprocessor(Preprocessor):
+    """ Remove reference definitions from text and store for later use. """
 
+    RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)
 
-class ReferencePreprocessor(Preprocessor):
-    """Remove reference definitions from the text and store them for later use."""
     def run (self, lines):
         new_text = [];
         for line in lines:
-            m = CORE_RE['reference-def'].match(line)
+            m = self.RE.match(line)
             if m:
                 id = m.group(2).strip().lower()
                 t = m.group(4).strip()  # potential title
@@ -1776,7 +1617,7 @@ class Markdown:
         * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
 
         """
-        self.parser = MarkdownParser()
+        self.parser = BlockParser()
         self.safeMode = safe_mode
         self.registeredExtensions = []
         self.docType = ""
@@ -1784,8 +1625,6 @@ class Markdown:
 
         self.preprocessors = OrderedDict()
         self.preprocessors["html_block"] =  HtmlBlockPreprocessor(self)
-        self.preprocessors["header"] = HeaderPreprocessor(self)
-        self.preprocessors["line"] =  LinePreprocessor(self)
         self.preprocessors["reference"] = ReferencePreprocessor(self)
         # footnote preprocessor will be inserted with "<reference"
 
diff --git a/markdown_extensions/legacy.py b/markdown_extensions/legacy.py
new file mode 100644
index 0000000..1320734
--- /dev/null
+++ b/markdown_extensions/legacy.py
@@ -0,0 +1,469 @@
+"""
+Legacy Extension for Python-Markdown
+====================================
+
+Replaces the core parser with the old one.
+
+"""
+
+import re
+
+import markdown
+from markdown import etree, Preprocessor, AtomicString, message, CRITICAL
+
+"""Basic and reusable regular expressions."""
+
+def wrapRe(raw_re):
+    """Compile raw_re anchored as "^...$" with the re.DOTALL flag."""
+    return re.compile("^%s$" % raw_re, re.DOTALL)
+
+
+# Basic and reusable regular expressions, keyed by the construct they detect.
+CORE_RE = {
+    'header':          wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title
+    'reference-def':   wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'),
+                               # [Google]: http://www.google.com/
+    # 'containsline' used to be listed twice; the later, multi-line (re.M)
+    # definition always replaced the first one, so only that one is kept.
+    'containsline':    re.compile(r'^([-]*)$|^([=]*)$', re.M), # -----, =====
+    'ol':              wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text
+    'ul':              wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text"
+    'isline1':         wrapRe(r'(\**)'), # ***
+    'isline2':         wrapRe(r'(\-*)'), # ---
+    'isline3':         wrapRe(r'(\_*)'), # ___
+    'tabbed':          wrapRe(r'((\t)|(    ))(.*)'), # an indented line
+    'quoted':          wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...")
+    # Raw string: "\{" and "\}" are regex escapes, not string escapes.
+    'attr':            re.compile(r"\{@([^\}]*)=([^\}]*)}") # {@id=123}
+}
+
+class MarkdownParser:
+    """Parse Markdown into an ElementTree."""
+
+    def __init__(self):
+        """Create a parser; no instance state is set up."""
+        pass
+
+    def parseDocument(self, lines):
+        """Build an ElementTree for a document given as a list of lines.
+
+        The lines are cut into chunks, a new chunk beginning at every line
+        that starts with "#".  Each chunk is passed to parseChunk with the
+        root "div" element as its parent.
+
+        Keyword arguments:
+
+        * lines: a list of lines
+
+        Returns: an ElementTree whose root is the "div" element
+
+        """
+        root = etree.Element("div")
+        pending = []
+        for line in lines:
+            if not line.startswith("#"):
+                pending.append(line)
+                continue
+            # A hash line opens a new chunk: flush what was gathered so far.
+            self.parseChunk(root, pending)
+            pending = [line]
+
+        # Whatever is left after the last hash line forms the final chunk.
+        self.parseChunk(root, pending)
+        return etree.ElementTree(root)
+
+    def parseChunk(self, parent_elem, lines, inList=0, looseList=0):
+        """Process a chunk of markdown-formatted text and attach the parse to
+        an ElementTree node.
+
+        Process a section of a source document, looking for high
+        level structural elements: unordered and ordered lists, block
+        quotes and indented code blocks.  Each of those is handed to a
+        dedicated handler, which also takes care of the lines that follow
+        the structure.  Anything else is treated as a header, a horizontal
+        rule or a plain paragraph.
+
+        Keyword arguments:
+
+        * parent_elem: The ElementTree element to which the content will be
+                       added.
+        * lines: a list of lines
+        * inList: a level; when non-zero, a list marker or a blank line
+                  ends the current block
+        * looseList: passed on to the recursive call and to the paragraph
+                     handler
+
+        Returns: None
+
+        """
+        # Loop through lines until none left.
+        while lines:
+            # Skipping empty line
+            if not lines[0]:
+                lines = lines[1:]
+                continue
+
+            # Check if this section starts with a list, a blockquote or
+            # a code block.  If so, process them.
+            processFn = { 'ul':     self.__processUList,
+                          'ol':     self.__processOList,
+                          'quoted': self.__processQuote,
+                          'tabbed': self.__processCodeBlock}
+            for regexp in ['ul', 'ol', 'quoted', 'tabbed']:
+                m = CORE_RE[regexp].match(lines[0])
+                if m:
+                    # The handler parses the remaining lines itself, so
+                    # there is nothing left to do here.
+                    processFn[regexp](parent_elem, lines, inList)
+                    return
+
+            # We are NOT looking at one of the high-level structures like
+            # lists or blockquotes.  So, it's just a regular paragraph
+            # (though perhaps nested inside a list or something else).  If
+            # we are NOT inside a list, we just need to look for a blank
+            # line to find the end of the block.  If we ARE inside a
+            # list, however, we need to consider that a sublist does not
+            # need to be separated by a blank line.  Rather, the following
+            # markup is legal:
+            #
+            # * The top level list item
+            #
+            #     Another paragraph of the list.  This is where we are now.
+            #     * Underneath we might have a sublist.
+            #
+
+            if inList:
+                # Cut the block at the next list marker or blank line and
+                # parse that part with the list level lowered by one.
+                start, lines  = self.__linesUntil(lines, (lambda line:
+                                 CORE_RE['ul'].match(line)
+                                 or CORE_RE['ol'].match(line)
+                                                  or not line.strip()))
+                self.parseChunk(parent_elem, start, inList-1,
+                                looseList=looseList)
+                inList = inList-1
+
+            else: # Ok, so it's just a simple block
+                # The block ends at a blank line or at a line starting
+                # with ">".
+                test = lambda line: not line.strip() or line[0] == '>'
+                paragraph, lines = self.__linesUntil(lines, test)
+                if len(paragraph) and paragraph[0].startswith('#'):
+                    self.__processHeader(parent_elem, paragraph)
+                elif len(paragraph) and CORE_RE["isline3"].match(paragraph[0]):
+                    self.__processHR(parent_elem)
+                    # Only the first line was the rule; put the rest of
+                    # the block back in front of the remaining lines.
+                    lines = paragraph[1:] + lines
+                elif paragraph:
+                    self.__processParagraph(parent_elem, paragraph,
+                                          inList, looseList)
+
+            if lines and not lines[0].strip():
+                lines = lines[1:]  # skip the first (blank) line
+
+    def __processHR(self, parentElem):
+        """Append an empty "hr" element as the last child of parentElem.
+
+        Keyword arguments:
+
+        * parentElem: ElementTree element to which the rule will be added
+
+        Returns: None
+
+        """
+        etree.SubElement(parentElem, "hr")
+
+    def __processHeader(self, parentElem, paragraph):
+        """Turn the first line of paragraph into an "h1".."h6" element.
+
+        The number of leading hashes selects the level; the stripped title
+        becomes the text of the new element.  If the line does not match
+        the header pattern a CRITICAL message is emitted instead.
+
+        Keyword arguments:
+
+        * parentElem: ElementTree element to which the header will be added
+        * paragraph: a list of lines; only the first one is examined
+
+        Returns: None
+
+        """
+        match = CORE_RE['header'].match(paragraph[0])
+        if not match:
+            message(CRITICAL, "We've got a problem header!")
+            return
+        hashes, title = match.group(1), match.group(2)
+        heading = etree.SubElement(parentElem, "h%d" % len(hashes))
+        heading.text = title.strip()
+
+    def __processParagraph(self, parentElem, paragraph, inList, looseList):
+        """Attach the lines of a paragraph to parentElem.
+
+        The text goes into a new "p" element, except when parentElem is an
+        "li" that has no children yet and the list is not loose; then the
+        text is set directly on parentElem.  Rule and header lines found
+        inside the paragraph are converted as they are met.
+
+        Keyword arguments:
+
+        * parentElem: ElementTree element to which the content will be added
+        * paragraph: a list of lines
+        * inList: a level (not used by this method)
+        * looseList: when true, a "p" element is always created
+
+        Returns: None
+
+        """
+        bare_item = (parentElem.tag == 'li'
+                     and not looseList
+                     and not parentElem.getchildren())
+        if bare_item:
+            # First paragraph of a tight list item: no <p> wrapper.
+            target = parentElem
+        else:
+            target = etree.SubElement(parentElem, "p")
+
+        pending = []
+        for line in paragraph:
+            is_rule = CORE_RE["isline3"].match(line)
+            if not (is_rule or line.startswith("#")):
+                pending.append(line)
+                continue
+            # A rule or header line: store the text gathered so far, emit
+            # the element and start gathering again.
+            target.text = "\n".join(pending)
+            if is_rule:
+                self.__processHR(target)
+            else:
+                self.__processHeader(parentElem, [line])
+            pending = []
+
+        if pending:
+            target.text = "\n".join(pending)
+
+    def __processUList(self, parentElem, lines, inList):
+        """Process lines as an unordered list, producing a "ul" element."""
+        self.__processList(parentElem, lines, inList, 'ul', 'ul')
+
+    def __processOList(self, parentElem, lines, inList):
+        """Process lines as an ordered list, producing an "ol" element."""
+        self.__processList(parentElem, lines, inList, 'ol', 'ol')
+
+    def __processList(self, parentElem, lines, inList, listexpr, tag):
+        """
+        Given a list of document lines starting with a list item,
+        finds the end of the list, breaks it up, and recursively
+        processes each list item and the remainder of the text file.
+
+        Keyword arguments:
+
+        * parentElem: An ElementTree element to which the content will be
+                      added
+        * lines: a list of lines
+        * inList: a level
+        * listexpr: the CORE_RE key ('ul' or 'ol') used to decide whether
+                    the list continues after a blank line
+        * tag: the tag name of the list element that is created
+
+        Returns: None
+
+        """
+        ul = etree.SubElement(parentElem, tag) # ul might actually be '<ol>'
+
+        # Becomes true once a blank line is found inside the list.
+        looseList = 0
+
+        # Make a list of list items
+        items = []
+        item = -1  # index of the item currently being filled
+
+        i = 0  # a counter to keep track of where we are
+        for line in lines:
+            loose = 0
+            if not line.strip():
+                # If we see a blank line, this _might_ be the end of the list
+                i += 1
+                loose = 1
+
+                # Find the next non-blank line
+                for j in range(i, len(lines)):
+                    if lines[j].strip():
+                        next = lines[j]
+                        break
+                else:
+                    # There is no more text => end of the list
+                    break
+
+                # Check if the next non-blank line is still a part of the list
+
+                if ( CORE_RE[listexpr].match(next) or
+                     CORE_RE['tabbed'].match(next) ):
+                    # get rid of any white space in the line
+                    items[item].append(line.strip())
+                    looseList = loose or looseList
+                    continue
+                else:
+                    break # found end of the list
+
+            # Now we need to detect list items (at the current level)
+            # while also detabing child elements if necessary
+
+            for expr in ['ul', 'ol', 'tabbed']:
+                m = CORE_RE[expr].match(line)
+                if m:
+                    if expr in ['ul', 'ol']:  # We are looking at a new item
+                        #if m.group(1) :
+                        # Removed the check to allow for a blank line
+                        # at the beginning of the list item
+                        items.append([m.group(1)])
+                        item += 1
+                    elif expr == 'tabbed':  # This line needs to be detabbed
+                        items[item].append(m.group(4)) #after the 'tab'
+                    i += 1
+                    break
+            else:
+                items[item].append(line)  # Just regular continuation
+                i += 1 # added on 2006.02.25
+        else:
+            # Every line was consumed without a break.  i is bumped once
+            # more; lines[i:] below is an empty list either way.
+            i += 1
+
+        # Add the ElementTree elements
+        for item in items:
+            li = etree.SubElement(ul, "li")
+            self.parseChunk(li, item, inList + 1, looseList = looseList)
+
+        # Process the remaining part of the section
+        self.parseChunk(parentElem, lines[i:], inList)
+
+    def __linesUntil(self, lines, condition):
+        """
+        Split lines in two at the first line for which condition is true.
+
+        Keyword arguments:
+
+        * lines: a list of lines
+        * condition: a predicate function taking a single line
+
+        Returns: a (before, rest) pair; rest starts with the matching
+        line and is empty when no line satisfies the condition
+
+        """
+        for index, line in enumerate(lines):
+            if condition(line):
+                return lines[:index], lines[index:]
+        # No line matched: everything goes into the first part.
+        end = len(lines)
+        return lines[:end], lines[end:]
+
+    def __processQuote(self, parentElem, lines, inList):
+        """
+        Collect the lines of a block quote, strip the quote markers and
+        parse the result into a new "blockquote" element; the lines that
+        follow the quote are then parsed into parentElem.
+
+        Unquoted lines are kept as part of the quote, as is a single
+        blank line.  The quote ends at the first unquoted line that
+        comes after such a blank line.
+
+        Keyword arguments:
+
+        * parentElem: ElementTree element to which the content will be added
+        * lines: a list of lines
+        * inList: a level
+
+        Returns: None
+
+        """
+        body = []
+        seen_blank = False  # allow one blank line between paragraphs
+        for line in lines:
+            match = CORE_RE['quoted'].match(line)
+            if match:
+                body.append(match.group(1))
+                seen_blank = False
+            elif seen_blank:
+                break
+            else:
+                body.append(line)
+                seen_blank = (line.strip() == '')
+
+        blockquote = etree.SubElement(parentElem, "blockquote")
+
+        self.parseChunk(blockquote, body, inList)
+        # One entry was collected per consumed line, so len(body) is the
+        # index of the first line after the quote.
+        self.parseChunk(parentElem, lines[len(body):], inList)
+
+    def __processCodeBlock(self, parentElem, lines, inList):
+        """
+        Take the indented block at the start of lines, store it verbatim
+        in a new "pre"/"code" element pair under parentElem and parse the
+        lines after the block into parentElem.
+
+        Keyword arguments:
+
+        * parentElem: ElementTree element to which the content will be added
+        * lines: a list of lines
+        * inList: a level
+
+        Returns: None
+
+        """
+        code_lines, remainder = self.detectTabbed(lines)
+        code = etree.SubElement(etree.SubElement(parentElem, "pre"), "code")
+        # Trailing whitespace is trimmed and a single newline is appended.
+        code.text = AtomicString("\n".join(code_lines).rstrip() + "\n")
+        self.parseChunk(parentElem, remainder, inList)
+
+    def detectTabbed(self, lines):
+        """ Find indented text and remove indent before further processing.
+
+        Lines are taken from the start of the list for as long as they
+        are indented by a tab or four spaces.  Blank lines are kept (as
+        empty strings) only when the next non-blank line is indented too.
+
+        Keyword arguments:
+
+        * lines: an array of strings
+
+        Returns: a list of the detabbed lines and the unused
+        remainder of the original list
+
+        """
+        items = []
+        item = -1  # not used anywhere else in this method
+        i = 0 # to keep track of where we are
+
+        def detab(line):
+            # Return the text after the leading tab or four spaces, or
+            # None (implicitly) when the line is not indented.
+            match = CORE_RE['tabbed'].match(line)
+            if match:
+               return match.group(4)
+
+        for line in lines:
+            if line.strip(): # Non-blank line
+                line = detab(line)
+                if line:
+                    items.append(line)
+                    i += 1
+                    continue
+                else:
+                    # First non-indented line: the block ends here.
+                    return items, lines[i:]
+
+            else: # Blank line: _maybe_ we are done.
+                i += 1 # advance
+
+                # Find the next non-blank line
+                for j in range(i, len(lines)):
+                    if lines[j].strip():
+                        next_line = lines[j]; break
+                else:
+                    break # There is no more text; we are done.
+
+                # Check if the next non-blank line is tabbed
+                if detab(next_line): # Yes, more work to do.
+                    items.append("")
+                    continue
+                else:
+                    break # No, we are done.
+        else:
+            # Every line was consumed without a break.  i is bumped once
+            # more; lines[i:] below is an empty list either way.
+            i += 1
+
+        return items, lines[i:]
+
+class HeaderPreprocessor(Preprocessor):
+
+    """Replace underlined headers with hashed headers.
+
+    (To avoid the need for lookahead later.)
+
+    """
+
+    def run (self, lines):
+        """Rewrite underlined headers in place and return the list.
+
+        A non-blank line whose following line starts with "=" and, once
+        stripped, consists only of "=" becomes "# <text>"; with "-" it
+        becomes "## <text>".  The underline is replaced by an empty
+        string.  An entry holding a single newline character is inserted
+        after every line that starts with "#".
+
+        Keyword arguments:
+
+        * lines: a list of lines; it is modified in place
+
+        Returns: the same list
+
+        """
+        i = -1
+        while i+1 < len(lines):
+            i = i+1
+            if not lines[i].strip():
+                continue
+
+            if lines[i].startswith("#"):
+                lines.insert(i+1, "\n")
+
+            # Strictly less than: lines[i+1] must exist before it is
+            # inspected, otherwise a non-blank last line raises IndexError.
+            if (i+1 < len(lines)
+                  and lines[i+1]
+                  and lines[i+1][0] in ['-', '=']):
+
+                underline = lines[i+1].strip()
+
+                if underline == "="*len(underline):
+                    lines[i] = "# " + lines[i].strip()
+                    lines[i+1] = ""
+                elif underline == "-"*len(underline):
+                    lines[i] = "## " + lines[i].strip()
+                    lines[i+1] = ""
+
+        return lines
+
+
+class LinePreprocessor(Preprocessor):
+    """Convert HR lines to "___" format."""
+    blockquote_re = re.compile(r'^(> )+')
+
+    def run(self, lines):
+        """Replace each rule line by "___", keeping a leading "> " prefix.
+
+        Keyword arguments:
+
+        * lines: a list of lines; it is modified in place
+
+        Returns: the same list
+
+        """
+        for index, line in enumerate(lines):
+            quote = self.blockquote_re.search(line)
+            if quote:
+                prefix = quote.group(0)
+            else:
+                prefix = ''
+            # Only the part after any block quote markers is tested.
+            if self._isLine(line[len(prefix):]):
+                lines[index] = prefix + "___"
+        return lines
+
+    def _isLine(self, block):
+        """Determine if a block should be replaced with an <HR>
+
+        True when block, with all whitespace removed, is more than two
+        characters long and consists only of "*", only of "-" or only of
+        "_".  A block starting with four spaces is never a rule.
+
+        """
+        if block.startswith("    "):
+            return False  # a code block
+        squeezed = "".join([char for char in block if not char.isspace()])
+        if len(squeezed) <= 2:
+            return False
+        for name in ('isline1', 'isline2', 'isline3'):
+            found = CORE_RE[name].match(squeezed)
+            if found and found.group(1):
+                return True
+        return False
+
+
+class LegacyExtension(markdown.Extension):
+    """ Replace Markdown's core parser. """
+
+    def extendMarkdown(self, md, md_globals):
+        """ Set the core parser to an instance of MarkdownParser.
+
+        Also adds the "header" and "line" preprocessors, both with the
+        "<reference" location argument.
+
+        Keyword arguments:
+
+        * md: the object whose parser and preprocessors are changed
+        * md_globals: not used here
+
+        """
+        md.parser = MarkdownParser()
+        # NOTE(review): the preprocessors are constructed with this
+        # extension (self) rather than md -- confirm that is intended.
+        md.preprocessors.add ("header", HeaderPreprocessor(self), "<reference")
+        md.preprocessors.add("line",  LinePreprocessor(self), "<reference")
+ 
+
+def makeExtension(configs=None):
+    """Return a LegacyExtension built from configs.
+
+    Keyword arguments:
+
+    * configs: configuration passed to LegacyExtension; a fresh empty
+               dict is used when omitted, so that calls never share one
+               mutable default object
+
+    """
+    if configs is None:
+        configs = {}
+    return LegacyExtension(configs=configs)
+
author	Waylan Limberg <waylan@gmail.com>	2008-11-11 19:18:41 -0500
committer	Waylan Limberg <waylan@gmail.com>	2008-11-13 23:19:45 -0500
commit	ba147ca9b2eae544e802c8216936065d2d86a8d8 (patch)
tree	5395a029c1a054de8f20cfadacdbbd0de10754f9
parent	e968bbf38cc6570da2ccc3c7e87d99a36758544c (diff)
download	markdown-ba147ca9b2eae544e802c8216936065d2d86a8d8.tar.gz markdown-ba147ca9b2eae544e802c8216936065d2d86a8d8.tar.bz2 markdown-ba147ca9b2eae544e802c8216936065d2d86a8d8.zip