aboutsummaryrefslogtreecommitdiffstats
path: root/markdown.py
diff options
context:
space:
mode:
Diffstat (limited to 'markdown.py')
-rwxr-xr-xmarkdown.py215
1 files changed, 186 insertions, 29 deletions
diff --git a/markdown.py b/markdown.py
index 3ae26e3..9a8e2d4 100755
--- a/markdown.py
+++ b/markdown.py
@@ -166,37 +166,53 @@ def dequote(string):
OVERALL DESIGN
=============================================================================
-Markdown processing takes place in three steps:
+Markdown processing takes place in four steps:
1. A bunch of "preprocessors" munge the input text.
-2. MarkdownParser() parses the high-level structural elements of the
+2. BlockParser() parses the high-level structural elements of the
pre-processed text into an ElementTree.
-3. A bunch of Patterns are run against the ElementTree, detecting inline
+3. A bunch of "treeprocessors" are run against the ElementTree. One such
+ treeprocessor runs InlinePatterns against the ElementTree, detecting inline
markup.
-4. Some extra use-defined post-processors are run.
+4. Some post-processors are run against the text after the ElementTree has
+ been serialized into text.
5. The output is written to a string.
Those steps are put together by the Markdown() class.
The code below is organized as follows:
-1. MarkdownParser class - does basic parsing.
-2. All the post-processors, patterns, etc.
+1. BlockParser and it's BlockProcessors - does core block parsing.
+2. All the preprocessors, patterns, treeprocessors, and postprocessors.
3. Markdown class - does the high-level wrapping.
"""
"""
-CORE MARKDOWN PARSER
+CORE MARKDOWN BLOCKPARSER
=============================================================================
-This class handles basic Markdown parsing. It doesn't concern itself with
-inline elements such as **bold** or *italics*, but rather just catches blocks,
-lists, quotes, etc.
+This parser handles basic parsing of Markdown blocks. It doesn't concern itself
+with inline elements such as **bold** or *italics*, but rather just catches
+blocks, lists, quotes, etc.
+
+The BlockParser is made up of a bunch of BlockProssors, each handling a
+different type of block. Extensions may add/replace/remove BlockProcessors
+as they need to alter how markdown blocks are parsed.
+
"""
class BlockProcessor:
- """ Base class for block processors. """
+ """ Base class for block processors.
+
+ Each subclass will provide the methods below to work with the source and
+ tree. Each processor will need to define it's own ``test`` and ``run``
+ methods. The ``test`` method should return True or False, to indicate
+ whether the current block should be processed by this processor. If the
+ test passes, the parser will call the processors ``run`` method.
+
+ """
+
def __init__(self, parser=None):
self.parser = parser
@@ -229,15 +245,60 @@ class BlockProcessor:
return '\n'.join(lines)
def test(self, parent, block):
- """ Return boolean. Must be overriden by subclasses. """
+ """ Test for block type. Must be overridden by subclasses.
+
+ As the parser loops through processors, it will call the ``test`` method
+ on each to determine if the given block of text is of that type. This
+ method must return a boolean ``True`` or ``False``. The actual method of
+ testing is left to the needs of that particular block type. It could
+ be as simple as ``block.startswith(some_string)`` or a complex regular
+ expression. As the block type may be different depending on the parent
+ of the block (i.e. inside a list), the parent etree element is also
+ provided and may be used as part of the test.
+
+ Keywords:
+
+ * ``parent``: A etree element which will be the parent of the block.
+ * ``block``: A block of text from the source which has been split at
+ blank lines.
+ """
pass
def run(self, parent, blocks):
- """ Run processor. Must be overridden by subclasses. """
+ """ Run processor. Must be overridden by subclasses.
+
+ When the parser determines the appropriate type of a block, the parser
+ will call the corresponding processor's ``run`` method. This method
+ should parse the individual lines of the block and append them to
+ the etree.
+
+ Note that both the ``parent`` and ``etree`` keywords are pointers
+ to instances of the objects which should be edited in place. Each
+ processor must make changes to the existing objects as there is no
+ mechanism to return new/different objects to replace them.
+
+ This means that this method should be adding SubElements or adding text
+ to the parent, and should remove (``pop``) or add (``insert``) items to
+ the list of blocks.
+
+ Keywords:
+
+ * ``parent``: A etree element which is the parent of the current block.
+ * ``blocks``: A list of all remaining blocks of the document.
+ """
+ pass
class ListIndentProcessor(BlockProcessor):
- """ Process children of list items. """
+ """ Process children of list items.
+
+ Example:
+ * a list item
+ process this part
+
+ or this part
+
+ """
def test(self, parent, block):
return block.startswith(' '*4) and \
@@ -251,13 +312,18 @@ class ListIndentProcessor(BlockProcessor):
block = self.looseDetab(blocks.pop(0))
sibling = self.lastChild(parent)
if parent.tag == 'li':
+ # The parent is already a li. Just parse the child block.
self.parser.parseBlocks(parent, [block])
elif len(sibling) and sibling[-1].tag == 'li':
+ # The parent is a list (``ol`` or ``ul``) which has children.
+ # Assume the last child li is the parent of this block.
if sibling[-1].text:
+ # If the parent li has text, that text needs to be moved to a p
block = '%s\n\n%s' % (sibling[-1].text, block)
sibling[-1].text = ''
self.parser.parseChunk(sibling[-1], block)
else:
+ # Create a new li and parse the block with it as the parent.
li = etree.SubElement(sibling, 'li')
self.parser.parseBlocks(li, [block])
@@ -274,15 +340,22 @@ class CodeBlockProcessor(BlockProcessor):
theRest = ''
if sibling and sibling.tag == "pre" and len(sibling) \
and sibling[0].tag == "code":
+ # The previous block was a code block. As blank lines do not start
+ # new code blocks, append this block to the previous, adding back
+ # linebreaks removed from the split into a list.
code = sibling[0]
block, theRest = self.detab(block)
code.text = AtomicString('%s\n%s\n' % (code.text, block.rstrip()))
else:
+ # This is a new codeblock. Create the elements and insert text.
pre = etree.SubElement(parent, 'pre')
code = etree.SubElement(pre, 'code')
block, theRest = self.detab(block)
code.text = AtomicString('%s\n' % block.rstrip())
if theRest:
+ # This block contained unindented line(s) after the first indented
+ # line. Insert these lines as the first block of the master blocks
+ # list for future processing.
blocks.insert(0, theRest)
@@ -298,13 +371,16 @@ class BlockQuoteProcessor(BlockProcessor):
blocks.pop(0).split('\n')])
sibling = self.lastChild(parent)
if sibling and sibling.tag == "blockquote":
+ # Previous block was a blockquote so set that as this blocks parent
quote = sibling
else:
+ # This is a new blockquote. Create a new parent element.
quote = etree.SubElement(parent, 'blockquote')
+ # Recursively parse block with blockquote as parent.
self.parser.parseChunk(quote, block)
def clean(self, line):
- """ Remove ``>`` from begining of a line. """
+ """ Remove ``>`` from beginning of a line. """
m = self.RE.match(line)
if line.strip() == ">":
return ""
@@ -317,17 +393,22 @@ class OListProcessor(BlockProcessor):
""" Process ordered list blocks. """
TAG = 'ol'
+ # Detect an item (``1. item``). ``group(1)`` contains contents of item.
RE = re.compile(r'^[ ]{0,3}\d+\.[ ](.*)')
+ # Detect items on secondary lines. they can be of either list type.
CHILD_RE = re.compile(r'^[ ]{0,3}((\d+\.)|[*+-])[ ](.*)')
+ # Detect indented (nested) items of either type
INDENT_RE = re.compile(r'^[ ]{4,7}((\d+\.)|[*+-])[ ].*')
def test(self, parent, block):
return bool(self.RE.match(block))
def run(self, parent, blocks):
+ # Check fr multiple items in one block.
items = self.get_items(blocks.pop(0))
sibling = self.lastChild(parent)
if sibling and (sibling.tag == 'ol' or sibling.tag == 'ul'):
+ # Previous block was a list item, so set that as parent
lst = sibling
# make sure previous item is in a p.
if len(lst) and lst[-1].text and not len(lst[-1]):
@@ -341,12 +422,17 @@ class OListProcessor(BlockProcessor):
self.parser.parseBlocks(li, [firstitem])
self.parser.state.reset()
else:
+ # This is a new list so create parent with appropriate tag.
lst = etree.SubElement(parent, self.TAG)
self.parser.state.set('list')
+ # Loop through items in block, recursively parsing each with the
+ # appropriate parent.
for item in items:
if item.startswith(' '*4):
+ # Item is indented. Parse with last item as parent
self.parser.parseBlocks(lst[-1], [item])
else:
+ # New item. Create li and parse with it as parent
li = etree.SubElement(lst, 'li')
self.parser.parseBlocks(li, [item])
self.parser.state.reset()
@@ -357,13 +443,17 @@ class OListProcessor(BlockProcessor):
for line in block.split('\n'):
m = self.CHILD_RE.match(line)
if m:
+ # This is a new item. Append
items.append(m.group(3))
elif self.INDENT_RE.match(line):
+ # This is an indented (possibly nested) item.
if items[-1].startswith(' '*4):
+ # Previous item was indented. Append to that item.
items[-1] = '%s\n%s' % (items[-1], line)
else:
items.append(line)
else:
+ # This is another line of previous item. Append to that item.
items[-1] = '%s\n%s' % (items[-1], line)
return items
@@ -378,6 +468,7 @@ class UListProcessor(OListProcessor):
class HashHeaderProcessor(BlockProcessor):
""" Process Hash Headers. """
+ # Detect a header at start of any line in block
RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')
def test(self, parent, block):
@@ -387,21 +478,28 @@ class HashHeaderProcessor(BlockProcessor):
block = blocks.pop(0)
m = self.RE.search(block)
if m:
- before = block[:m.start()]
- after = block[m.end():]
+ before = block[:m.start()] # All lines before header
+ after = block[m.end():] # All lines after header
if before:
+ # As the header was not the first line of the block and the
+ # lines before the header must be parsed first,
+ # recursively parse this lines as a block.
self.parser.parseBlocks(parent, [before])
+ # Create header using named groups from RE
h = etree.SubElement(parent, 'h%d' % len(m.group('level')))
h.text = m.group('header').strip()
if after:
+ # Insert remaining lines as first block for future parsing.
blocks.insert(0, after)
else:
+ # This should never happen, but just in case...
message(CRITICAL, "We've got a problem header!")
-class SHeaderProcessor(BlockProcessor):
+class SetextHeaderProcessor(BlockProcessor):
""" Process Setext-style Headers. """
+ # Detect Setext-style header. Must be first 2 lines of block.
RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)
def test(self, parent, block):
@@ -409,6 +507,7 @@ class SHeaderProcessor(BlockProcessor):
def run(self, parent, blocks):
lines = blocks.pop(0).split('\n')
+ # Determine level. ``=`` is 1 and ``-`` is 2.
if lines[1].startswith('='):
level = 1
else:
@@ -416,6 +515,7 @@ class SHeaderProcessor(BlockProcessor):
h = etree.SubElement(parent, 'h%d' % level)
h.text = lines[0].strip()
if len(lines) > 2:
+ # Block contains additional lines. Add to master blocks for later.
blocks.insert(0, '\n'.join(lines[2:]))
@@ -423,17 +523,18 @@ class HRProcessor(BlockProcessor):
""" Process Horizontal Rules. """
RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
+ # Detect hr on any line of a block.
SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
+ # Match a hr on a single line of text.
MATCH_RE = re.compile(r'^%s$' % RE)
def test(self, parent, block):
return bool(self.SEARCH_RE.search(block))
def run(self, parent, blocks):
- # Check for lines in block before hr.
- #import ipdb; ipdb.set_trace()
lines = blocks.pop(0).split('\n')
prelines = []
+ # Check for lines in block before hr.
for line in lines:
m = self.MATCH_RE.match(line)
if m:
@@ -441,18 +542,22 @@ class HRProcessor(BlockProcessor):
else:
prelines.append(line)
if len(prelines):
+ # Recursively parse lines before hr so they get parsed first.
self.parser.parseBlocks(parent, ['\n'.join(prelines)])
# create hr
hr = etree.SubElement(parent, 'hr')
# check for lines in block after hr.
lines = lines[len(prelines)+1:]
if len(lines):
+ # Add lines after hr to master blocks for later parsing.
blocks.insert(0, '\n'.join(lines))
class EmptyBlockProcessor(BlockProcessor):
""" Process blocks and start with an empty line. """
+ # Detect a block that only contains whitespace
+ # or only whitespace on the first line.
RE = re.compile(r'^\s*\n')
def test(self, parent, block):
@@ -462,14 +567,16 @@ class EmptyBlockProcessor(BlockProcessor):
block = blocks.pop(0)
m = self.RE.match(block)
if m:
+ # Add remaining line to master blocks for later.
blocks.insert(0, block[m.end():])
sibling = self.lastChild(parent)
if sibling and sibling.tag == 'pre' and sibling[0] and \
sibling[0].tag == 'code':
+ # Last block is a codeblock. Append to preserve whitespace.
sibling[0].text = AtomicString('%s/n/n/n' % sibling[0].text )
-class PBlockProcessor(BlockProcessor):
+class ParagraphProcessor(BlockProcessor):
""" Process Paragraph blocks. """
def test(self, parent, block):
@@ -478,17 +585,35 @@ class PBlockProcessor(BlockProcessor):
def run(self, parent, blocks):
block = blocks.pop(0)
if block.strip():
+ # Not a blank block. Add to parent, otherwise throw it away.
if self.parser.state.isstate('list'):
+ # The parent is a tight-list. Append to parent.text
if parent.text:
parent.text = '%s\n%s' % (parent.text, block)
else:
parent.text = block.lstrip()
else:
+ # Create a regular paragraph
p = etree.SubElement(parent, 'p')
p.text = block.lstrip()
class State(list):
- """ Track the current and nested stated of the parser. """
+ """ Track the current and nested state of the parser.
+
+ This utility class is used to track the state of the BlockParser and
+ support multiple levels if nesting. It's just a simple API wrapped around
+ a list. Each time a state is set, that state is appended to the end of the
+ list. Each time a state is reset, that state is removed from the end of
+ the list.
+
+ Therefore, each time a state is set for a nested block, that state must be
+ reset when we back out of that level of nesting or the state could be
+ corrupted.
+
+ While all the methods of a list object are available, only the three
+ defined below need be used.
+
+ """
def set(self, state):
""" Set a new state. """
@@ -499,14 +624,18 @@ class State(list):
self.pop()
def isstate(self, state):
- """ Test that top level is of given state. """
+ """ Test that top (current) level is of given state. """
if len(self):
return self[-1] == state
else:
return False
class BlockParser:
- """ Parse Markdown blocks into an ElementTree object. """
+ """ Parse Markdown blocks into an ElementTree object.
+
+ A wrapper class that stitches the various BlockProcessors together,
+ looping through them and creating an ElementTree object.
+ """
def __init__(self):
self.blockprocessors = OrderedDict()
@@ -514,27 +643,55 @@ class BlockParser:
self.blockprocessors['indent'] = ListIndentProcessor(self)
self.blockprocessors['code'] = CodeBlockProcessor(self)
self.blockprocessors['hashheader'] = HashHeaderProcessor(self)
- self.blockprocessors['sheader'] = SHeaderProcessor(self)
+ self.blockprocessors['setextheader'] = SetextHeaderProcessor(self)
self.blockprocessors['hr'] = HRProcessor(self)
self.blockprocessors['olist'] = OListProcessor(self)
self.blockprocessors['ulist'] = UListProcessor(self)
self.blockprocessors['quote'] = BlockQuoteProcessor(self)
- self.blockprocessors['paragraph'] = PBlockProcessor(self)
+ self.blockprocessors['paragraph'] = ParagraphProcessor(self)
self.state = State()
def parseDocument(self, lines):
- """ Parse a markdown string into an ElementTree. """
+ """ Parse a markdown document into an ElementTree.
+
+ Given a list of lines, an ElementTree object (not just a parent Element)
+ is created and the root element is passed to the parser as the parent.
+ The ElementTree object is returned.
+
+ This should only be called on an entire document, not pieces.
+
+ """
# Create a ElementTree from the lines
root = etree.Element("div")
self.parseChunk(root, '\n'.join(lines))
return etree.ElementTree(root)
def parseChunk(self, parent, text):
- """ Parse a chunk of markdown text and attach to given etree node. """
+ """ Parse a chunk of markdown text and attach to given etree node.
+
+ While the ``text`` argument is generally assumed to contain multiple
+ blocks which will be split on blank lines, it could contain only one
+ block. Generally, this method would be called by extensions when
+ block parsing is required.
+
+ The ``parent`` etree Element passed in is altered in place.
+ Nothing is returned.
+
+ """
self.parseBlocks(parent, text.split('\n\n'))
def parseBlocks(self, parent, blocks):
- """ Process blocks of markdown text and attach to given etree node. """
+ """ Process blocks of markdown text and attach to given etree node.
+
+ Given a list of ``blocks``, each blockprocessor is stepped through
+ until there are no blocks left. While an extension could potentially
+ call this method directly, it's generally expected to be used internally.
+
+ This is a public method as an extension may need to add/alter additional
+ BlockProcessors which call this method to recursively parse a nested
+ block.
+
+ """
while blocks:
for processor in self.blockprocessors.values():
if processor.test(parent, blocks[0]):