Made significant improvements to comments and docstrings in BlockParser. Also renamed a few things to no longer use abbreviations.

author: Waylan Limberg <waylan@gmail.com> 2008-11-13 14:38:54 -0500
committer: Waylan Limberg <waylan@gmail.com> 2008-11-13 23:27:49 -0500
commit: 57a69d3bf45f41de4b9b2d646469436514be2475 (patch)
tree: c6102545c07206e7ede760c87f4b88c7fe32129f
parent: 2bf55d59937e91262677ef617e108d9178865454 (diff)
download: markdown-57a69d3bf45f41de4b9b2d646469436514be2475.tar.gz
markdown-57a69d3bf45f41de4b9b2d646469436514be2475.tar.bz2
markdown-57a69d3bf45f41de4b9b2d646469436514be2475.zip
1 files changed, 186 insertions, 29 deletions
diff --git a/markdown.py b/markdown.py
index 3ae26e3..9a8e2d4 100755
--- a/markdown.py
+++ b/markdown.py
@@ -166,37 +166,53 @@ def dequote(string):
 OVERALL DESIGN
 =============================================================================
 
-Markdown processing takes place in three steps:
+Markdown processing takes place in four steps:
 
 1. A bunch of "preprocessors" munge the input text.
-2. MarkdownParser() parses the high-level structural elements of the
+2. BlockParser() parses the high-level structural elements of the
    pre-processed text into an ElementTree.
-3. A bunch of Patterns are run against the ElementTree, detecting inline
+3. A bunch of "treeprocessors" are run against the ElementTree. One such
+   treeprocessor runs InlinePatterns against the ElementTree, detecting inline
    markup.
-4. Some extra use-defined post-processors are run.
+4. Some post-processors are run against the text after the ElementTree has
+   been serialized into text.
 5. The output is written to a string.
 
 Those steps are put together by the Markdown() class.
 
 The code below is organized as follows:
 
-1. MarkdownParser class - does basic parsing.
-2. All the post-processors, patterns, etc.
+1. BlockParser and its BlockProcessors - does core block parsing.
+2. All the preprocessors, patterns, treeprocessors, and postprocessors.
 3. Markdown class - does the high-level wrapping.
 """
 
 
 """
-CORE MARKDOWN PARSER
+CORE MARKDOWN BLOCKPARSER
 =============================================================================
 
-This class handles basic Markdown parsing.  It doesn't concern itself with
-inline elements such as **bold** or *italics*, but rather just catches blocks,
-lists, quotes, etc.
+This parser handles basic parsing of Markdown blocks.  It doesn't concern itself
+with inline elements such as **bold** or *italics*, but rather just catches 
+blocks, lists, quotes, etc.
+
+The BlockParser is made up of a bunch of BlockProcessors, each handling a
+different type of block. Extensions may add/replace/remove BlockProcessors
+as they need to alter how markdown blocks are parsed.
+
 """
 
 class BlockProcessor:
-    """ Base class for block processors. """
+    """ Base class for block processors. 
+    
+    Each subclass will provide the methods below to work with the source and
+    tree. Each processor will need to define its own ``test`` and ``run``
+    methods. The ``test`` method should return True or False, to indicate
+    whether the current block should be processed by this processor. If the
+    test passes, the parser will call the processor's ``run`` method.
+
+    """
+
     def __init__(self, parser=None):
         self.parser = parser
 
@@ -229,15 +245,60 @@ class BlockProcessor:
         return '\n'.join(lines)
 
     def test(self, parent, block):
-        """ Return boolean. Must be overriden by subclasses. """
+        """ Test for block type. Must be overridden by subclasses. 
+        
+        As the parser loops through processors, it will call the ``test`` method
+        on each to determine if the given block of text is of that type. This
+        method must return a boolean ``True`` or ``False``. The actual method of
+        testing is left to the needs of that particular block type. It could 
+        be as simple as ``block.startswith(some_string)`` or a complex regular
+        expression. As the block type may be different depending on the parent
+        of the block (i.e. inside a list), the parent etree element is also 
+        provided and may be used as part of the test.
+
+        Keywords:
+        
+        * ``parent``: An etree element which will be the parent of the block.
+        * ``block``: A block of text from the source which has been split at 
+            blank lines.
+        """
         pass
 
     def run(self, parent, blocks):
-        """ Run processor. Must be overridden by subclasses. """
+        """ Run processor. Must be overridden by subclasses. 
+        
+        When the parser determines the appropriate type of a block, the parser
+        will call the corresponding processor's ``run`` method. This method
+        should parse the individual lines of the block and append them to
+        the etree. 
+
+        Note that both the ``parent`` and ``blocks`` keywords are pointers
+        to instances of the objects which should be edited in place. Each
+        processor must make changes to the existing objects as there is no
+        mechanism to return new/different objects to replace them.
+
+        This means that this method should be adding SubElements or adding text
+        to the parent, and should remove (``pop``) or add (``insert``) items to
+        the list of blocks.
+
+        Keywords:
+
+        * ``parent``: An etree element which is the parent of the current block.
+        * ``blocks``: A list of all remaining blocks of the document.
+        """
+        pass
 
 
 class ListIndentProcessor(BlockProcessor):
-    """ Process children of list items. """
+    """ Process children of list items. 
+    
+    Example:
+        * a list item
+            process this part
+
+            or this part
+
+    """
 
     def test(self, parent, block):
         return block.startswith(' '*4) and \
@@ -251,13 +312,18 @@ class ListIndentProcessor(BlockProcessor):
         block = self.looseDetab(blocks.pop(0))
         sibling = self.lastChild(parent)
         if parent.tag == 'li':
+            # The parent is already a li. Just parse the child block.
             self.parser.parseBlocks(parent, [block])
         elif len(sibling) and sibling[-1].tag == 'li':
+            # The parent is a list (``ol`` or ``ul``) which has children.
+            # Assume the last child li is the parent of this block.
             if sibling[-1].text:
+                # If the parent li has text, that text needs to be moved to a p
                 block = '%s\n\n%s' % (sibling[-1].text, block)
                 sibling[-1].text = ''
             self.parser.parseChunk(sibling[-1], block)
         else:
+            # Create a new li and parse the block with it as the parent.
             li = etree.SubElement(sibling, 'li')
             self.parser.parseBlocks(li, [block])
 
@@ -274,15 +340,22 @@ class CodeBlockProcessor(BlockProcessor):
         theRest = ''
         if sibling and sibling.tag == "pre" and len(sibling) \
                     and sibling[0].tag == "code":
+            # The previous block was a code block. As blank lines do not start
+            # new code blocks, append this block to the previous, adding back
+            # linebreaks removed from the split into a list.
             code = sibling[0]
             block, theRest = self.detab(block)
             code.text = AtomicString('%s\n%s\n' % (code.text, block.rstrip()))
         else:
+            # This is a new codeblock. Create the elements and insert text.
             pre = etree.SubElement(parent, 'pre')
             code = etree.SubElement(pre, 'code')
             block, theRest = self.detab(block)
             code.text = AtomicString('%s\n' % block.rstrip())
         if theRest:
+            # This block contained unindented line(s) after the first indented 
+            # line. Insert these lines as the first block of the master blocks
+            # list for future processing.
             blocks.insert(0, theRest)
 
 
@@ -298,13 +371,16 @@ class BlockQuoteProcessor(BlockProcessor):
                             blocks.pop(0).split('\n')])
         sibling = self.lastChild(parent)
         if sibling and sibling.tag == "blockquote":
+            # Previous block was a blockquote so set that as this block's parent
             quote = sibling
         else:
+            # This is a new blockquote. Create a new parent element.
             quote = etree.SubElement(parent, 'blockquote')
+        # Recursively parse block with blockquote as parent.
         self.parser.parseChunk(quote, block)
 
     def clean(self, line):
-        """ Remove ``>`` from begining of a line. """
+        """ Remove ``>`` from beginning of a line. """
         m = self.RE.match(line)
         if line.strip() == ">":
             return ""
@@ -317,17 +393,22 @@ class OListProcessor(BlockProcessor):
     """ Process ordered list blocks. """
 
     TAG = 'ol'
+    # Detect an item (``1. item``). ``group(1)`` contains contents of item.
     RE = re.compile(r'^[ ]{0,3}\d+\.[ ](.*)')
+    # Detect items on secondary lines. They can be of either list type.
     CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ](.*)')
+    # Detect indented (nested) items of either type
     INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ].*')
 
     def test(self, parent, block):
         return bool(self.RE.match(block))
 
     def run(self, parent, blocks):
+        # Check for multiple items in one block.
         items = self.get_items(blocks.pop(0))
         sibling = self.lastChild(parent)
         if sibling and (sibling.tag == 'ol' or sibling.tag == 'ul'):
+            # Previous block was a list (``ol`` or ``ul``), so set that as parent
             lst = sibling
             # make sure previous item is in a p.
             if len(lst) and lst[-1].text and not len(lst[-1]):
@@ -341,12 +422,17 @@ class OListProcessor(BlockProcessor):
             self.parser.parseBlocks(li, [firstitem])
             self.parser.state.reset()
         else:
+            # This is a new list so create parent with appropriate tag.
             lst = etree.SubElement(parent, self.TAG)
         self.parser.state.set('list')
+        # Loop through items in block, recursively parsing each with the
+        # appropriate parent.
         for item in items:
             if item.startswith(' '*4):
+                # Item is indented. Parse with last item as parent
                 self.parser.parseBlocks(lst[-1], [item])
             else:
+                # New item. Create li and parse with it as parent
                 li = etree.SubElement(lst, 'li')
                 self.parser.parseBlocks(li, [item])
         self.parser.state.reset()
@@ -357,13 +443,17 @@ class OListProcessor(BlockProcessor):
         for line in block.split('\n'):
             m = self.CHILD_RE.match(line)
             if m:
+                # This is a new item. Append
                 items.append(m.group(3))
             elif self.INDENT_RE.match(line):
+                # This is an indented (possibly nested) item.
                 if items[-1].startswith(' '*4):
+                    # Previous item was indented. Append to that item.
                     items[-1] = '%s\n%s' % (items[-1], line)
                 else:
                     items.append(line)
             else:
+                # This is another line of previous item. Append to that item.
                 items[-1] = '%s\n%s' % (items[-1], line)
         return items
 
@@ -378,6 +468,7 @@ class UListProcessor(OListProcessor):
 class HashHeaderProcessor(BlockProcessor):
     """ Process Hash Headers. """
 
+    # Detect a header at start of any line in block
     RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')
 
     def test(self, parent, block):
@@ -387,21 +478,28 @@ class HashHeaderProcessor(BlockProcessor):
         block = blocks.pop(0)
         m = self.RE.search(block)
         if m:
-            before = block[:m.start()]
-            after = block[m.end():]
+            before = block[:m.start()] # All lines before header
+            after = block[m.end():]    # All lines after header
             if before:
+                # As the header was not the first line of the block and the
+                # lines before the header must be parsed first,
+                # recursively parse these lines as a block.
                 self.parser.parseBlocks(parent, [before])
+            # Create header using named groups from RE
             h = etree.SubElement(parent, 'h%d' % len(m.group('level')))
             h.text = m.group('header').strip()
             if after:
+                # Insert remaining lines as first block for future parsing.
                 blocks.insert(0, after)
         else:
+            # This should never happen, but just in case...
             message(CRITICAL, "We've got a problem header!")
 
 
-class SHeaderProcessor(BlockProcessor):
+class SetextHeaderProcessor(BlockProcessor):
     """ Process Setext-style Headers. """
 
+    # Detect Setext-style header. Must be first 2 lines of block.
     RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)
 
     def test(self, parent, block):
@@ -409,6 +507,7 @@ class SHeaderProcessor(BlockProcessor):
 
     def run(self, parent, blocks):
         lines = blocks.pop(0).split('\n')
+        # Determine level. ``=`` is 1 and ``-`` is 2.
         if lines[1].startswith('='):
             level = 1
         else:
@@ -416,6 +515,7 @@ class SHeaderProcessor(BlockProcessor):
         h = etree.SubElement(parent, 'h%d' % level)
         h.text = lines[0].strip()
         if len(lines) > 2:
+            # Block contains additional lines. Add to master blocks for later.
             blocks.insert(0, '\n'.join(lines[2:]))
 
 
@@ -423,17 +523,18 @@ class HRProcessor(BlockProcessor):
     """ Process Horizontal Rules. """
 
     RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
+    # Detect hr on any line of a block.
     SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
+    # Match a hr on a single line of text.
     MATCH_RE = re.compile(r'^%s$' % RE)
 
     def test(self, parent, block):
         return bool(self.SEARCH_RE.search(block))
 
     def run(self, parent, blocks):
-        # Check for lines in block before hr.
-        #import ipdb; ipdb.set_trace()
         lines = blocks.pop(0).split('\n')
         prelines = []
+        # Check for lines in block before hr.
         for line in lines:
             m = self.MATCH_RE.match(line)
             if m:
@@ -441,18 +542,22 @@ class HRProcessor(BlockProcessor):
             else:
                 prelines.append(line)
         if len(prelines):
+            # Recursively parse lines before hr so they get parsed first.
             self.parser.parseBlocks(parent, ['\n'.join(prelines)])
         # create hr
         hr = etree.SubElement(parent, 'hr')
         # check for lines in block after hr.
         lines = lines[len(prelines)+1:]
         if len(lines):
+            # Add lines after hr to master blocks for later parsing.
             blocks.insert(0, '\n'.join(lines))
 
 
 class EmptyBlockProcessor(BlockProcessor):
     """ Process blocks and start with an empty line. """
 
+    # Detect a block that only contains whitespace 
+    # or only whitespace on the first line.
     RE = re.compile(r'^\s*\n')
 
     def test(self, parent, block):
@@ -462,14 +567,16 @@ class EmptyBlockProcessor(BlockProcessor):
         block = blocks.pop(0)
         m = self.RE.match(block)
         if m:
+            # Add remaining line to master blocks for later.
             blocks.insert(0, block[m.end():])
             sibling = self.lastChild(parent)
             if sibling and sibling.tag == 'pre' and sibling[0] and \
                     sibling[0].tag == 'code':
+                # Last block is a codeblock. Append to preserve whitespace.
                 sibling[0].text = AtomicString('%s/n/n/n' % sibling[0].text )
 
 
-class PBlockProcessor(BlockProcessor):
+class ParagraphProcessor(BlockProcessor):
     """ Process Paragraph blocks. """
 
     def test(self, parent, block):
@@ -478,17 +585,35 @@ class PBlockProcessor(BlockProcessor):
     def run(self, parent, blocks):
         block = blocks.pop(0)
         if block.strip():
+            # Not a blank block. Add to parent, otherwise throw it away.
             if self.parser.state.isstate('list'):
+                # The parent is a tight-list. Append to parent.text
                 if parent.text:
                     parent.text = '%s\n%s' % (parent.text, block)
                 else:
                     parent.text = block.lstrip()
             else:
+                # Create a regular paragraph
                 p = etree.SubElement(parent, 'p')
                 p.text = block.lstrip()
 
 class State(list):
-    """ Track the current and nested stated of the parser. """
+    """ Track the current and nested state of the parser. 
+    
+    This utility class is used to track the state of the BlockParser and 
+    support multiple levels of nesting. It's just a simple API wrapped around
+    a list. Each time a state is set, that state is appended to the end of the
+    list. Each time a state is reset, that state is removed from the end of
+    the list.
+
+    Therefore, each time a state is set for a nested block, that state must be 
+    reset when we back out of that level of nesting or the state could be
+    corrupted.
+
+    While all the methods of a list object are available, only the three
+    defined below need be used.
+
+    """
 
     def set(self, state):
         """ Set a new state. """
@@ -499,14 +624,18 @@ class State(list):
         self.pop()
 
     def isstate(self, state):
-        """ Test that top level is of given state. """
+        """ Test that top (current) level is of given state. """
         if len(self):
             return self[-1] == state
         else:
             return False
 
 class BlockParser:
-    """ Parse Markdown blocks into an ElementTree object. """
+    """ Parse Markdown blocks into an ElementTree object. 
+    
+    A wrapper class that stitches the various BlockProcessors together,
+    looping through them and creating an ElementTree object.
+    """
 
     def __init__(self):
         self.blockprocessors = OrderedDict()
@@ -514,27 +643,55 @@ class BlockParser:
         self.blockprocessors['indent'] = ListIndentProcessor(self)
         self.blockprocessors['code'] = CodeBlockProcessor(self)
         self.blockprocessors['hashheader'] = HashHeaderProcessor(self)
-        self.blockprocessors['sheader'] = SHeaderProcessor(self)
+        self.blockprocessors['setextheader'] = SetextHeaderProcessor(self)
         self.blockprocessors['hr'] = HRProcessor(self)
         self.blockprocessors['olist'] = OListProcessor(self)
         self.blockprocessors['ulist'] = UListProcessor(self)
         self.blockprocessors['quote'] = BlockQuoteProcessor(self)
-        self.blockprocessors['paragraph'] = PBlockProcessor(self)
+        self.blockprocessors['paragraph'] = ParagraphProcessor(self)
         self.state = State()
 
     def parseDocument(self, lines):
-        """ Parse a markdown string into an ElementTree. """
+        """ Parse a markdown document into an ElementTree. 
+        
+        Given a list of lines, an ElementTree object (not just a parent Element)
+        is created and the root element is passed to the parser as the parent.
+        The ElementTree object is returned.
+        
+        This should only be called on an entire document, not pieces.
+
+        """
         # Create a ElementTree from the lines
         root = etree.Element("div")
         self.parseChunk(root, '\n'.join(lines))
         return etree.ElementTree(root)
 
     def parseChunk(self, parent, text):
-        """ Parse a chunk of markdown text and attach to given etree node. """
+        """ Parse a chunk of markdown text and attach to given etree node. 
+        
+        While the ``text`` argument is generally assumed to contain multiple
+        blocks which will be split on blank lines, it could contain only one
+        block. Generally, this method would be called by extensions when
+        block parsing is required. 
+        
+        The ``parent`` etree Element passed in is altered in place. 
+        Nothing is returned.
+
+        """
         self.parseBlocks(parent, text.split('\n\n'))
 
     def parseBlocks(self, parent, blocks):
-        """ Process blocks of markdown text and attach to given etree node. """
+        """ Process blocks of markdown text and attach to given etree node. 
+        
+        Given a list of ``blocks``, each blockprocessor is stepped through
+        until there are no blocks left. While an extension could potentially
+        call this method directly, it's generally expected to be used internally.
+
+        This is a public method as an extension may need to add/alter additional
+        BlockProcessors which call this method to recursively parse a nested
+        block.
+
+        """
         while blocks:
            for processor in self.blockprocessors.values():
                if processor.test(parent, blocks[0]):
author	Waylan Limberg <waylan@gmail.com>	2008-11-13 14:38:54 -0500
committer	Waylan Limberg <waylan@gmail.com>	2008-11-13 23:27:49 -0500
commit	57a69d3bf45f41de4b9b2d646469436514be2475 (patch)
tree	c6102545c07206e7ede760c87f4b88c7fe32129f
parent	2bf55d59937e91262677ef617e108d9178865454 (diff)
download	markdown-57a69d3bf45f41de4b9b2d646469436514be2475.tar.gz markdown-57a69d3bf45f41de4b9b2d646469436514be2475.tar.bz2 markdown-57a69d3bf45f41de4b9b2d646469436514be2475.zip