path: root/markdown.py
diff options
Diffstat (limited to 'markdown.py')
1 files changed, 240 insertions, 401 deletions
diff --git a/markdown.py b/markdown.py
index b534151..b40093f 100755
--- a/markdown.py
+++ b/markdown.py
@@ -98,24 +98,6 @@ INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
-def wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL)
-CORE_RE = {
- 'header': wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title
- 'reference-def': wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'),
- # [Google]: http://www.google.com/
- 'containsline': wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc.
- 'ol': wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text
- 'ul': wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text"
- 'isline1': wrapRe(r'(\**)'), # ***
- 'isline2': wrapRe(r'(\-*)'), # ---
- 'isline3': wrapRe(r'(\_*)'), # ___
- 'tabbed': wrapRe(r'((\t)|( ))(.*)'), # an indented line
- 'quoted': wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...")
- 'containsline': re.compile(r'^([-]*)$|^([=]*)$', re.M),
- 'attr': re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
-"""Basic and reusable regular expressions."""
@@ -163,11 +145,13 @@ def isBlockLevel(tag):
"""Check if the tag is a block level HTML tag."""
return BLOCK_LEVEL_ELEMENTS.match(tag)
+ATTR_RE = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
def handleAttributes(text, parent):
"""Set values of an element based on attribute definitions ({@id=123})."""
def attributeCallback(match):
parent.set(match.group(1), match.group(2))
- return CORE_RE['attr'].sub(attributeCallback, text)
+ return ATTR_RE.sub(attributeCallback, text)
def dequote(string):
"""Remove quotes from around a string."""
@@ -211,369 +195,286 @@ inline elements such as **bold** or *italics*, but rather just catches blocks,
lists, quotes, etc.
-class MarkdownParser:
- """Parser Markdown into a ElementTree."""
+class BlockProcessor:
+ """ Base class for block processors. """
+ def __init__(self, parser=None):
+ self.parser = parser
- def __init__(self):
- pass
+ def lastChild(self, parent):
+ """ Return the last child of an etree element. """
+ if len(parent):
+ return parent[-1]
+ else:
+ return None
- def parseDocument(self, lines):
- """Parse a markdown string into an ElementTree."""
- # Create a ElementTree from the lines
- root = etree.Element("div")
- buffer = []
+ def detab(self, text):
+ """ Remove a tab from the front of each line of the given text. """
+ newtext = []
+ lines = text.split('\n')
for line in lines:
- if line.startswith("#"):
- self.parseChunk(root, buffer)
- buffer = [line]
+ if line.startswith(' '*4):
+ newtext.append(line[4:])
+ elif not line.strip():
+ newtext.append('')
- buffer.append(line)
- self.parseChunk(root, buffer)
- return etree.ElementTree(root)
+ break
+ return '\n'.join(newtext), '\n'.join(lines[len(newtext):])
- def parseChunk(self, parent_elem, lines, inList=0, looseList=0):
- """Process a chunk of markdown-formatted text and attach the parse to
- an ElementTree node.
+ def looseDetab(self, text):
+ """ Remove a tab from the front of lines, but allow dedented lines. """
+ lines = text.split('\n')
+ for i in range(len(lines)):
+ if lines[i].startswith(' '*4):
+ lines[i] = lines[i][4:]
+ return '\n'.join(lines)
- Process a section of a source document, looking for high
- level structural elements like lists, block quotes, code
- segments, html blocks, etc. Some those then get stripped
- of their high level markup (e.g. get unindented) and the
- lower-level markup is processed recursively.
+ def test(self, parent, block):
+ """ Return boolean. Must be overridden by subclasses. """
+ pass
- Keyword arguments:
+ def run(self, parent, blocks):
+ """ Run processor. Must be overridden by subclasses. """
- * parent_elem: The ElementTree element to which the content will be
- added.
- * lines: a list of lines
- * inList: a level
- Returns: None
+class ListIndentProcessor(BlockProcessor):
+ """ Process children of list items. """
- """
- # Loop through lines until none left.
- while lines:
- # Skipping empty line
- if not lines[0]:
- lines = lines[1:]
- continue
+ def test(self, parent, block):
+ return block.startswith(' '*4) and parent[-1] and \
+ (parent[-1].tag == "ul" or parent[-1].tag == "ol")
- # Check if this section starts with a list, a blockquote or
- # a code block. If so, process them.
- processFn = { 'ul': self.__processUList,
- 'ol': self.__processOList,
- 'quoted': self.__processQuote,
- 'tabbed': self.__processCodeBlock}
- for regexp in ['ul', 'ol', 'quoted', 'tabbed']:
- m = CORE_RE[regexp].match(lines[0])
- if m:
- processFn[regexp](parent_elem, lines, inList)
- return
- # We are NOT looking at one of the high-level structures like
- # lists or blockquotes. So, it's just a regular paragraph
- # (though perhaps nested inside a list or something else). If
- # we are NOT inside a list, we just need to look for a blank
- # line to find the end of the block. If we ARE inside a
- # list, however, we need to consider that a sublist does not
- # need to be separated by a blank line. Rather, the following
- # markup is legal:
- #
- # * The top level list item
- #
- # Another paragraph of the list. This is where we are now.
- # * Underneath we might have a sublist.
- #
- if inList:
- start, lines = self.__linesUntil(lines, (lambda line:
- CORE_RE['ul'].match(line)
- or CORE_RE['ol'].match(line)
- or not line.strip()))
- self.parseChunk(parent_elem, start, inList-1,
- looseList=looseList)
- inList = inList-1
- else: # Ok, so it's just a simple block
- test = lambda line: not line.strip() or line[0] == '>'
- paragraph, lines = self.__linesUntil(lines, test)
- if len(paragraph) and paragraph[0].startswith('#'):
- self.__processHeader(parent_elem, paragraph)
- elif len(paragraph) and CORE_RE["isline3"].match(paragraph[0]):
- self.__processHR(parent_elem)
- lines = paragraph[1:] + lines
- elif paragraph:
- self.__processParagraph(parent_elem, paragraph,
- inList, looseList)
- if lines and not lines[0].strip():
- lines = lines[1:] # skip the first (blank) line
- def __processHR(self, parentElem):
- hr = etree.SubElement(parentElem, "hr")
- def __processHeader(self, parentElem, paragraph):
- m = CORE_RE['header'].match(paragraph[0])
- if m:
- level = len(m.group(1))
- h = etree.SubElement(parentElem, "h%d" % level)
- h.text = m.group(2).strip()
+ def run(self, parent, blocks):
+ block = blocks.pop(0)
+ sibling = self.lastChild(parent)
+ if len(sibling) and sibling[-1].tag == 'li':
+ self.parser.parseBlocks(sibling[-1], [self.looseDetab(block)])
- message(CRITICAL, "We've got a problem header!")
+ li = etree.SubElement(sibling, 'li')
+ self.parser.parseBlocks(li, [self.looseDetab(block)])
- def __processParagraph(self, parentElem, paragraph, inList, looseList):
- if ( parentElem.tag == 'li'
- and not (looseList or parentElem.getchildren())):
+class CodeBlockProcessor(BlockProcessor):
+ """ Process code blocks. """
- # If this is the first paragraph inside "li", don't
- # put <p> around it - append the paragraph bits directly
- # onto parentElem
- el = parentElem
+ def test(self, parent, block):
+ return block.startswith(' '*4)
+ def run(self, parent, blocks):
+ sibling = self.lastChild(parent)
+ block = blocks.pop(0)
+ theRest = ''
+ if sibling and sibling.tag == "pre" and len(sibling) \
+ and sibling[0].tag == "code":
+ code = sibling[0]
+ block, theRest = self.detab(block)
+ code.text = '%s\n%s\n' % (code.text, block.rstrip())
- # Otherwise make a "p" element
- el = etree.SubElement(parentElem, "p")
- dump = []
- # Searching for hr or header
- for line in paragraph:
- # it's hr
- if CORE_RE["isline3"].match(line):
- el.text = "\n".join(dump)
- self.__processHR(el)
- dump = []
- # it's header
- elif line.startswith("#"):
- el.text = "\n".join(dump)
- self.__processHeader(parentElem, [line])
- dump = []
- else:
- dump.append(line)
- if dump:
- text = "\n".join(dump)
- el.text = text
- def __processUList(self, parentElem, lines, inList):
- self.__processList(parentElem, lines, inList, listexpr='ul', tag='ul')
- def __processOList(self, parentElem, lines, inList):
- self.__processList(parentElem, lines, inList, listexpr='ol', tag='ol')
- def __processList(self, parentElem, lines, inList, listexpr, tag):
- """
- Given a list of document lines starting with a list item,
- finds the end of the list, breaks it up, and recursively
- processes each list item and the remainder of the text file.
- Keyword arguments:
- * parentElem: A ElementTree element to which the content will be added
- * lines: a list of lines
- * inList: a level
+ pre = etree.SubElement(parent, 'pre')
+ code = etree.SubElement(pre, 'code')
+ block, theRest = self.detab(block)
+ code.text = '%s\n' % block.rstrip()
+ if theRest:
+ blocks.insert(0, theRest)
- Returns: None
- """
- ul = etree.SubElement(parentElem, tag) # ul might actually be '<ol>'
- looseList = 0
- # Make a list of list items
- items = []
- item = -1
+class BlockQuoteProcessor(BlockProcessor):
- i = 0 # a counter to keep track of where we are
- for line in lines:
- loose = 0
- if not line.strip():
- # If we see a blank line, this _might_ be the end of the list
- i += 1
- loose = 1
- # Find the next non-blank line
- for j in range(i, len(lines)):
- if lines[j].strip():
- next = lines[j]
- break
- else:
- # There is no more text => end of the list
- break
+ RE = re.compile(r'^[ ]{0,3}>[ ](.*)')
- # Check if the next non-blank line is still a part of the list
+ def test(self, parent, block):
+ return bool(self.RE.match(block))
- if ( CORE_RE[listexpr].match(next) or
- CORE_RE['tabbed'].match(next) ):
- # get rid of any white space in the line
- items[item].append(line.strip())
- looseList = loose or looseList
- continue
- else:
- break # found end of the list
- # Now we need to detect list items (at the current level)
- # while also detabing child elements if necessary
- for expr in ['ul', 'ol', 'tabbed']:
- m = CORE_RE[expr].match(line)
- if m:
- if expr in ['ul', 'ol']: # We are looking at a new item
- #if m.group(1) :
- # Removed the check to allow for a blank line
- # at the beginning of the list item
- items.append([m.group(1)])
- item += 1
- elif expr == 'tabbed': # This line needs to be detabbed
- items[item].append(m.group(4)) #after the 'tab'
- i += 1
- break
- else:
- items[item].append(line) # Just regular continuation
- i += 1 # added on 2006.02.25
+ def run(self, parent, blocks):
+ block = '\n'.join([self.clean(line) for line in
+ blocks.pop(0).split('\n')])
+ sibling = self.lastChild(parent)
+ if sibling and sibling.tag == "blockquote":
+ quote = sibling
- i += 1
- # Add the ElementTree elements
- for item in items:
- li = etree.SubElement(ul, "li")
- self.parseChunk(li, item, inList + 1, looseList = looseList)
+ quote = etree.SubElement(parent, 'blockquote')
+ self.parser.parseBlocks(quote, [block])
- # Process the remaining part of the section
- self.parseChunk(parentElem, lines[i:], inList)
- def __linesUntil(self, lines, condition):
- """
- A utility function to break a list of lines upon the
- first line that satisfied a condition. The condition
- argument should be a predicate function.
- """
- i = -1
- for line in lines:
- i += 1
- if condition(line):
- break
+ def clean(self, line):
+ """ Remove ``>`` from beginning of a line. """
+ m = self.RE.match(line)
+ if m:
+ return m.group(1)
+ elif line.strip() == ">":
+ return ""
+ else:
+ return line
+class OListProcessor(BlockProcessor):
+ """ Process ordered list blocks. """
+ TAG = 'ol'
+ RE = re.compile(r'^[ ]{0,3}\d+\.[ ](.*)')
+ def test(self, parent, block):
+ return bool(self.RE.match(block))
+ def run(self, parent, blocks):
+ items = self.get_items(blocks.pop(0))
+ sibling = self.lastChild(parent)
+ if sibling and sibling.tag == self.TAG:
+ lst = sibling
+ # make sure previous item is in a p.
+ if len(lst) and lst[-1].text and not len(lst[-1]):
+ p = etree.SubElement(lst[-1], 'p')
+ p.text = lst[-1].text
+ lst[-1].text = ''
+ # parse first block differently as it gets wrapped in a p.
+ li = etree.SubElement(lst, 'li')
+ self.parser.state = 'looselist'
+ firstitem = items.pop(0)
+ self.parser.parseBlocks(li, [firstitem])
+ self.parser.resetState()
- i += 1
- return lines[:i], lines[i:]
+ lst = etree.SubElement(parent, self.TAG)
+ self.parser.state = 'list'
+ for item in items:
+ li = etree.SubElement(lst, 'li')
+ self.parser.parseBlocks(li, [item])
+ self.parser.resetState()
- def __processQuote(self, parentElem, lines, inList):
- """
- Given a list of document lines starting with a quote finds
- the end of the quote, unindents it and recursively
- processes the body of the quote and the remainder of the
- text file.
+ def get_items(self, block):
+ """ Break a block into list items. """
+ items = []
+ for line in block.split('\n'):
+ m = self.RE.match(line)
+ if m:
+ items.append(m.group(1))
+ else:
+ items[-1] = '\n'.join([items[-1], line])
+ return items
- Keyword arguments:
- * parentElem: ElementTree element to which the content will be added
- * lines: a list of lines
- * inList: a level
+class UListProcessor(OListProcessor):
+ """ Process unordered list blocks. """
- Returns: None
+ TAG = 'ul'
+ RE = re.compile(r'^[ ]{0,3}[*+-][ ](.*)')
- """
- dequoted = []
- i = 0
- blank_line = False # allow one blank line between paragraphs
- for line in lines:
- m = CORE_RE['quoted'].match(line)
- if m:
- dequoted.append(m.group(1))
- i += 1
- blank_line = False
- elif not blank_line and line.strip() != '':
- dequoted.append(line)
- i += 1
- elif not blank_line and line.strip() == '':
- dequoted.append(line)
- i += 1
- blank_line = True
- else:
- break
- blockquote = etree.SubElement(parentElem, "blockquote")
+class HashHeaderProcessor(BlockProcessor):
+ """ Process Hash Headers. """
- self.parseChunk(blockquote, dequoted, inList)
- self.parseChunk(parentElem, lines[i:], inList)
+ RE = re.compile(r'^(#{1,6})(.*?)#*$')
- def __processCodeBlock(self, parentElem, lines, inList):
- """
- Given a list of document lines starting with a code block
- finds the end of the block, puts it into the ElementTree verbatim
- wrapped in ("<pre><code>") and recursively processes the
- the remainder of the text file.
+ def test(self, parent, block):
+ return block.startswith('#')
- Keyword arguments:
+ def run(self, parent, blocks):
+ lines = blocks.pop(0).split('\n')
+ line1 = lines.pop(0)
+ m = self.RE.match(line1)
+ if m:
+ h = etree.SubElement(parent, 'h%d' % len(m.group(1)))
+ h.text = m.group(2).strip()
+ else:
+ lines.insert(0, line1)
+ if len(lines):
+ blocks.insert(0, '\n'.join(lines))
- * parentElem: ElementTree element to which the content will be added
- * lines: a list of lines
- * inList: a level
- Returns: None
+class SHeaderProcessor(BlockProcessor):
+ """ Process Setext-style Headers. """
- """
- detabbed, theRest = self.detectTabbed(lines)
- pre = etree.SubElement(parentElem, "pre")
- code = etree.SubElement(pre, "code")
- text = "\n".join(detabbed).rstrip()+"\n"
- code.text = AtomicString(text)
- self.parseChunk(parentElem, theRest, inList)
+ RE = re.compile(r'^.*?\n[=-]{3,}', re.MULTILINE)
- def detectTabbed(self, lines):
- """ Find indented text and remove indent before further proccesing.
+ def test(self, parent, block):
+ return bool(self.RE.match(block))
- Keyword arguments:
+ def run(self, parent, blocks):
+ lines = blocks.pop(0).split('\n')
+ if lines[1].startswith('='):
+ level = 1
+ else:
+ level = 2
+ h = etree.SubElement(parent, 'h%d' % level)
+ h.text = lines[0].strip()
+ if len(lines) > 2:
+ blocks.insert(0, '\n'.join(lines[2:]))
- * lines: an array of strings
- * fn: a function that returns a substring of a string
- if the string matches the necessary criteria
- Returns: a list of post processes items and the unused
- remainder of the original list
+class HRProcessor(BlockProcessor):
+ """ Process Horizontal Rules. """
- """
- items = []
- item = -1
- i = 0 # to keep track of where we are
+ RE = re.compile(r'([*_-][ ]?){3,}')
- def detab(line):
- match = CORE_RE['tabbed'].match(line)
- if match:
- return match.group(4)
+ def test(self, parent, block):
+ return bool(self.RE.search(block))
+ def run(self, parent, blocks):
+ # Check for lines in block before hr.
+ lines = blocks.pop(0).split('\n')
+ prelines = []
for line in lines:
- if line.strip(): # Non-blank line
- line = detab(line)
- if line:
- items.append(line)
- i += 1
- continue
- else:
- return items, lines[i:]
+ m = self.RE.match(line)
+ if m:
+ break
+ else:
+ prelines.append(line)
+ if len(prelines):
+ self.parser.parseBlocks(parent, ['\n'.join(prelines)])
+ # create hr
+ hr = etree.SubElement(parent, 'hr')
+ # check for lines in block after hr.
+ lines = lines[len(prelines)+1:]
+ if len(lines):
+ blocks.insert(0, '\n'.join(lines))
+class PBlockProcessor(BlockProcessor):
+ """ Process Paragraph blocks. """
+ def test(self, parent, block):
+ return True
+ def run(self, parent, blocks):
+ block = blocks.pop(0)
+ if block.strip():
+ if self.parser.state == 'list':
+ parent.text = block
+ else:
+ p = etree.SubElement(parent, 'p')
+ p.text = block
- else: # Blank line: _maybe_ we are done.
- i += 1 # advance
- # Find the next non-blank line
- for j in range(i, len(lines)):
- if lines[j].strip():
- next_line = lines[j]; break
- else:
- break # There is no more text; we are done.
+class BlockParser:
+ """ Parse Markdown blocks into an ElementTree object. """
- # Check if the next non-blank line is tabbed
- if detab(next_line): # Yes, more work to do.
- items.append("")
- continue
- else:
- break # No, we are done.
- else:
- i += 1
+ def __init__(self):
+ self.blockprocessors = OrderedDict()
+ self.blockprocessors['indent'] = ListIndentProcessor(self)
+ self.blockprocessors['code'] = CodeBlockProcessor(self)
+ self.blockprocessors['hashheader'] = HashHeaderProcessor(self)
+ self.blockprocessors['sheader'] = SHeaderProcessor(self)
+ self.blockprocessors['hr'] = HRProcessor(self)
+ self.blockprocessors['olist'] = OListProcessor(self)
+ self.blockprocessors['ulist'] = UListProcessor(self)
+ self.blockprocessors['quote'] = BlockQuoteProcessor(self)
+ self.blockprocessors['paragraph'] = PBlockProcessor(self)
+ self.resetState()
+ def resetState(self):
+ self.state = ''
- return items, lines[i:]
+ def parseDocument(self, lines):
+ """ Parse a markdown string into an ElementTree. """
+ # Create an ElementTree from the lines
+ root = etree.Element("div")
+ blocks = '\n'.join(lines).split('\n\n')
+ self.parseBlocks(root, blocks)
+ return etree.ElementTree(root)
+ def parseBlocks(self, parent, blocks):
+ """ Process blocks of markdown text and attach to given etree node. """
+ while blocks:
+ for processor in self.blockprocessors.values():
+ if processor.test(parent, blocks[0]):
+ processor.run(parent, blocks)
+ break
@@ -725,75 +626,15 @@ class HtmlBlockPreprocessor(Preprocessor):
return new_text.split("\n")
-class HeaderPreprocessor(Preprocessor):
- """Replace underlined headers with hashed headers.
- (To avoid the need for lookahead later.)
- """
- def run (self, lines):
- i = -1
- while i+1 < len(lines):
- i = i+1
- if not lines[i].strip():
- continue
- if lines[i].startswith("#"):
- lines.insert(i+1, "\n")
- if (i+1 <= len(lines)
- and lines[i+1]
- and lines[i+1][0] in ['-', '=']):
- underline = lines[i+1].strip()
- if underline == "="*len(underline):
- lines[i] = "# " + lines[i].strip()
- lines[i+1] = ""
- elif underline == "-"*len(underline):
- lines[i] = "## " + lines[i].strip()
- lines[i+1] = ""
- return lines
-class LinePreprocessor(Preprocessor):
- """Convert HR lines to "___" format."""
- blockquote_re = re.compile(r'^(> )+')
- def run (self, lines):
- for i in range(len(lines)):
- prefix = ''
- m = self.blockquote_re.search(lines[i])
- if m:
- prefix = m.group(0)
- if self._isLine(lines[i][len(prefix):]):
- lines[i] = prefix + "___"
- return lines
- def _isLine(self, block):
- """Determine if a block should be replaced with an <HR>"""
- if block.startswith(" "):
- return False # a code block
- text = "".join([x for x in block if not x.isspace()])
- if len(text) <= 2:
- return False
- for pattern in ['isline1', 'isline2', 'isline3']:
- m = CORE_RE[pattern].match(text)
- if (m and m.group(1)):
- return True
- else:
- return False
+class ReferencePreprocessor(Preprocessor):
+ """ Remove reference definitions from text and store for later use. """
+ RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL)
-class ReferencePreprocessor(Preprocessor):
- """Remove reference definitions from the text and store them for later use."""
def run (self, lines):
new_text = [];
for line in lines:
- m = CORE_RE['reference-def'].match(line)
+ m = self.RE.match(line)
if m:
id = m.group(2).strip().lower()
t = m.group(4).strip() # potential title
@@ -1776,7 +1617,7 @@ class Markdown:
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
- self.parser = MarkdownParser()
+ self.parser = BlockParser()
self.safeMode = safe_mode
self.registeredExtensions = []
self.docType = ""
@@ -1784,8 +1625,6 @@ class Markdown:
self.preprocessors = OrderedDict()
self.preprocessors["html_block"] = HtmlBlockPreprocessor(self)
- self.preprocessors["header"] = HeaderPreprocessor(self)
- self.preprocessors["line"] = LinePreprocessor(self)
self.preprocessors["reference"] = ReferencePreprocessor(self)
# footnote preprocessor will be inserted with "<reference"