aboutsummaryrefslogtreecommitdiffstats
path: root/markdown.py
diff options
context:
space:
mode:
author Yuri Takhteyev <yuri@freewisdom.org> 2008-10-12 18:02:07 -0700
committer Yuri Takhteyev <yuri@freewisdom.org> 2008-10-12 18:02:07 -0700
commit 8e7e2a254859f61ec2632a11725d99c5ea0c9f09 (patch)
tree e29b3410f07364c80339cb1efbc6a9996d05e08d /markdown.py
parent 9a7f507d5f40e44f9573347955e9f70169ed5990 (diff)
download markdown-8e7e2a254859f61ec2632a11725d99c5ea0c9f09.tar.gz
markdown-8e7e2a254859f61ec2632a11725d99c5ea0c9f09.tar.bz2
markdown-8e7e2a254859f61ec2632a11725d99c5ea0c9f09.zip
More cleanup. Refactored all the core parsing logic into a separate
class: MarkdownParser.
Diffstat (limited to 'markdown.py')
-rwxr-xr-x markdown.py 1218
1 files changed, 620 insertions, 598 deletions
diff --git a/markdown.py b/markdown.py
index 562380a..0aa530d 100755
--- a/markdown.py
+++ b/markdown.py
@@ -97,6 +97,24 @@ INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
AMP_SUBSTITUTE = STX+"amp"+ETX
+def wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL)
+CORE_RE = {
+ 'header': wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title
+ 'reference-def': wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'),
+ # [Google]: http://www.google.com/
+ 'containsline': wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc.
+ 'ol': wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text
+ 'ul': wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text"
+ 'isline1': wrapRe(r'(\**)'), # ***
+ 'isline2': wrapRe(r'(\-*)'), # ---
+ 'isline3': wrapRe(r'(\_*)'), # ___
+ 'tabbed': wrapRe(r'((\t)|( ))(.*)'), # an indented line
+ 'quoted': wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...")
+ 'containsline': re.compile(r'^([-]*)$|^([=]*)$', re.M),
+ 'attr': re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
+}
+"""Basic and reusable regular expressions."""
+
"""
AUXILIARY GLOBAL FUNCTIONS
@@ -160,6 +178,432 @@ def dequote(string):
"""
+OVERALL DESIGN
+=============================================================================
+
+Markdown processing takes place in five steps:
+
+1. A bunch of "preprocessors" munge the input text.
+2. MarkdownParser() parses the high-level structural elements of the
+ pre-processed text into an ElementTree.
+3. A bunch of Patterns are run against the ElementTree, detecting inline
+ markup.
+4. Some extra user-defined post-processors are run.
+5. The output is written to a string.
+
+Those steps are put together by the Markdown() class.
+
+The code below is organized as follows:
+
+1. MarkdownParser class - does basic parsing.
+2. All the post-processors, patterns, etc.
+3. Markdown class - does the high-level wrapping.
+"""
+
+
+"""
+CORE MARKDOWN PARSER
+=============================================================================
+
+This class handles basic Markdown parsing. It doesn't concern itself with
+inline elements such as **bold** or *italics*, but rather just catches blocks,
+lists, quotes, etc.
+"""
+
+class MarkdownParser:
+ """Parse Markdown into an ElementTree."""
+
+ def __init__(self):
+ pass
+
+ def parseDocument(self, lines):
+ """Parse a list of markdown lines into an ElementTree."""
+ # Create a ElementTree from the lines
+ root = etree.Element("div")
+ buffer = []
+ for line in lines:
+ if line.startswith("#"):
+ self.parseChunk(root, buffer)
+ buffer = [line]
+ else:
+ buffer.append(line)
+
+ self.parseChunk(root, buffer)
+
+ return etree.ElementTree(root)
+
+
+ def parseChunk(self, parent_elem, lines, inList=0, looseList=0):
+ """Process a chunk of markdown-formatted text and attach the parse to
+ an ElementTree node.
+
+ Process a section of a source document, looking for high
+ level structural elements like lists, block quotes, code
+ segments, html blocks, etc. Some of those then get stripped
+ of their high level markup (e.g. get unindented) and the
+ lower-level markup is processed recursively.
+
+ Keyword arguments:
+
+ * parent_elem: A ElementTree element to which the content will be added.
+ * lines: a list of lines
+ * inList: a level
+
+ Returns: None
+
+ """
+ # Loop through lines until none left.
+ while lines:
+
+ # Skipping empty line
+ if not lines[0]:
+ lines = lines[1:]
+ continue
+
+ # Check if this section starts with a list, a blockquote or
+ # a code block
+
+ processFn = { 'ul': self._processUList,
+ 'ol': self._processOList,
+ 'quoted': self._processQuote,
+ 'tabbed': self._processCodeBlock}
+
+ for regexp in ['ul', 'ol', 'quoted', 'tabbed']:
+ m = CORE_RE[regexp].match(lines[0])
+ if m:
+ processFn[regexp](parent_elem, lines, inList)
+ return
+
+ # We are NOT looking at one of the high-level structures like
+ # lists or blockquotes. So, it's just a regular paragraph
+ # (though perhaps nested inside a list or something else). If
+ # we are NOT inside a list, we just need to look for a blank
+ # line to find the end of the block. If we ARE inside a
+ # list, however, we need to consider that a sublist does not
+ # need to be separated by a blank line. Rather, the following
+ # markup is legal:
+ #
+ # * The top level list item
+ #
+ # Another paragraph of the list. This is where we are now.
+ # * Underneath we might have a sublist.
+ #
+
+ if inList:
+
+ start, lines = self._linesUntil(lines, (lambda line:
+ CORE_RE['ul'].match(line)
+ or CORE_RE['ol'].match(line)
+ or not line.strip()))
+
+ self.parseChunk(parent_elem, start, inList-1, looseList=looseList)
+ inList = inList-1
+
+ else: # Ok, so it's just a simple block
+
+ paragraph, lines = self._linesUntil(lines, lambda line:
+ not line.strip() or line[0] == '>')
+
+ if len(paragraph) and paragraph[0].startswith('#'):
+ self._processHeader(parent_elem, paragraph)
+
+ elif len(paragraph) and \
+ CORE_RE["isline3"].match(paragraph[0]):
+
+ self._processHR(parent_elem)
+ lines = paragraph[1:] + lines
+
+ elif paragraph:
+ self._processParagraph(parent_elem, paragraph,
+ inList, looseList)
+
+ if lines and not lines[0].strip():
+ lines = lines[1:] # skip the first (blank) line
+
+ def _processHR(self, parentElem):
+ hr = etree.SubElement(parentElem, "hr")
+
+ def _processHeader(self, parentElem, paragraph):
+ m = CORE_RE['header'].match(paragraph[0])
+ if m:
+ level = len(m.group(1))
+ h = etree.SubElement(parentElem, "h%d" % level)
+ h.text = m.group(2).strip()
+ else:
+ message(CRITICAL, "We've got a problem header!")
+
+
+ def _processParagraph(self, parentElem, paragraph, inList, looseList):
+
+ if ( parentElem.tag == 'li'
+ and not (looseList or parentElem.getchildren())):
+
+ # If this is the first paragraph inside "li", don't
+ # put <p> around it - append the paragraph bits directly
+ # onto parentElem
+ el = parentElem
+ else:
+ # Otherwise make a "p" element
+ el = etree.SubElement(parentElem, "p")
+
+ dump = []
+
+ # Searching for hr or header
+ for line in paragraph:
+ # it's hr
+ if CORE_RE["isline3"].match(line):
+ el.text = "\n".join(dump)
+ self._processHR(el)
+ dump = []
+ # it's header
+ elif line.startswith("#"):
+ el.text = "\n".join(dump)
+ self._processHeader(parentElem, [line])
+ dump = []
+ else:
+ dump.append(line)
+ if dump:
+ text = "\n".join(dump)
+ el.text = text
+
+ def _processUList(self, parentElem, lines, inList):
+ self._processList(parentElem, lines, inList,
+ listexpr='ul', tag = 'ul')
+
+ def _processOList(self, parentElem, lines, inList):
+ self._processList(parentElem, lines, inList,
+ listexpr='ol', tag = 'ol')
+
+
+ def _processList(self, parentElem, lines, inList, listexpr, tag):
+ """
+ Given a list of document lines starting with a list item,
+ finds the end of the list, breaks it up, and recursively
+ processes each list item and the remainder of the text file.
+
+ Keyword arguments:
+
+ * parentElem: A ElementTree element to which the content will be added
+ * lines: a list of lines
+ * inList: a level
+
+ Returns: None
+
+ """
+ ul = etree.SubElement(parentElem, tag) # ul might actually be '<ol>'
+
+ looseList = 0
+
+ # Make a list of list items
+ items = []
+ item = -1
+
+ i = 0 # a counter to keep track of where we are
+
+ for line in lines:
+
+ loose = 0
+ if not line.strip():
+ # If we see a blank line, this _might_ be the end of the list
+ i += 1
+ loose = 1
+
+ # Find the next non-blank line
+ for j in range(i, len(lines)):
+ if lines[j].strip():
+ next = lines[j]
+ break
+ else:
+ # There is no more text => end of the list
+ break
+
+ # Check if the next non-blank line is still a part of the list
+
+ if ( CORE_RE[listexpr].match(next) or
+ CORE_RE['tabbed'].match(next) ):
+ # get rid of any white space in the line
+ items[item].append(line.strip())
+ looseList = loose or looseList
+ continue
+ else:
+ break # found end of the list
+
+ # Now we need to detect list items (at the current level)
+ # while also detabing child elements if necessary
+
+ for expr in ['ul', 'ol', 'tabbed']:
+
+ m = CORE_RE[expr].match(line)
+ if m:
+ if expr in ['ul', 'ol']: # We are looking at a new item
+ #if m.group(1) :
+ # Removed the check to allow for a blank line
+ # at the beginning of the list item
+ items.append([m.group(1)])
+ item += 1
+ elif expr == 'tabbed': # This line needs to be detabbed
+ items[item].append(m.group(4)) #after the 'tab'
+
+ i += 1
+ break
+ else:
+ items[item].append(line) # Just regular continuation
+ i += 1 # added on 2006.02.25
+ else:
+ i += 1
+
+ # Add the ElementTree elements
+ for item in items:
+ li = etree.SubElement(ul, "li")
+
+ self.parseChunk(li, item, inList + 1, looseList = looseList)
+
+ # Process the remaining part of the section
+
+ self.parseChunk(parentElem, lines[i:], inList)
+
+
+ def _linesUntil(self, lines, condition):
+ """
+ A utility function to break a list of lines upon the
+ first line that satisfied a condition. The condition
+ argument should be a predicate function.
+
+ """
+ i = -1
+ for line in lines:
+ i += 1
+ if condition(line):
+ break
+ else:
+ i += 1
+ return lines[:i], lines[i:]
+
+ def _processQuote(self, parentElem, lines, inList):
+ """
+ Given a list of document lines starting with a quote finds
+ the end of the quote, unindents it and recursively
+ processes the body of the quote and the remainder of the
+ text file.
+
+ Keyword arguments:
+
+ * parentElem: ElementTree element to which the content will be added
+ * lines: a list of lines
+ * inList: a level
+
+ Returns: None
+
+ """
+ dequoted = []
+ i = 0
+ blank_line = False # allow one blank line between paragraphs
+ for line in lines:
+ m = CORE_RE['quoted'].match(line)
+ if m:
+ dequoted.append(m.group(1))
+ i += 1
+ blank_line = False
+ elif not blank_line and line.strip() != '':
+ dequoted.append(line)
+ i += 1
+ elif not blank_line and line.strip() == '':
+ dequoted.append(line)
+ i += 1
+ blank_line = True
+ else:
+ break
+
+ blockquote = etree.SubElement(parentElem, "blockquote")
+
+ self.parseChunk(blockquote, dequoted, inList)
+ self.parseChunk(parentElem, lines[i:], inList)
+
+
+
+
+ def _processCodeBlock(self, parentElem, lines, inList):
+ """
+ Given a list of document lines starting with a code block
+ finds the end of the block, puts it into the ElementTree verbatim
+ wrapped in ("<pre><code>") and recursively processes the
+ remainder of the text file.
+
+ Keyword arguments:
+
+ * parentElem: ElementTree element to which the content will be added
+ * lines: a list of lines
+ * inList: a level
+
+ Returns: None
+
+ """
+ detabbed, theRest = self.detectTabbed(lines)
+
+ pre = etree.SubElement(parentElem, "pre")
+ code = etree.SubElement(pre, "code")
+
+ text = "\n".join(detabbed).rstrip()+"\n"
+ code.text = AtomicString(text)
+ self.parseChunk(parentElem, theRest, inList)
+
+ def detectTabbed(self, lines):
+ """ Find indented text and remove indent before further processing.
+
+ Keyword arguments:
+
+ * lines: an array of strings
+ * fn: a function that returns a substring of a string
+ if the string matches the necessary criteria
+
+ Returns: a list of post processes items and the unused
+ remainder of the original list
+
+ """
+ items = []
+ item = -1
+ i = 0 # to keep track of where we are
+
+ def detab(line):
+ match = CORE_RE['tabbed'].match(line)
+ if match:
+ return match.group(4)
+
+ for line in lines:
+ if line.strip(): # Non-blank line
+ line = detab(line)
+ if line:
+ items.append(line)
+ i += 1
+ continue
+ else:
+ return items, lines[i:]
+
+ else: # Blank line: _maybe_ we are done.
+ i += 1 # advance
+
+ # Find the next non-blank line
+ for j in range(i, len(lines)):
+ if lines[j].strip():
+ next_line = lines[j]; break
+ else:
+ break # There is no more text; we are done.
+
+ # Check if the next non-blank line is tabbed
+ if detab(next_line): # Yes, more work to do.
+ items.append("")
+ continue
+ else:
+ break # No, we are done.
+ else:
+ i += 1
+
+ return items, lines[i:]
+
+
+
+
+"""
PRE-PROCESSORS
=============================================================================
@@ -1014,35 +1458,10 @@ class InlineStash:
""" Reset instance """
self._nodes = {}
-"""
-CORE MARKDOWN
-=============================================================================
-
-The core part is still quite messy, despite substantial refactoring. If you
-are thinking of extending the syntax, see first if you can do it through
-pre-processors, post-processors, inline patterns or a combination of the three.
-"""
-
-def _wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL)
-CORE_RE = {
- 'header': _wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title
- 'reference-def': _wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'),
- # [Google]: http://www.google.com/
- 'containsline': _wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc.
- 'ol': _wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text
- 'ul': _wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text"
- 'isline1': _wrapRe(r'(\**)'), # ***
- 'isline2': _wrapRe(r'(\-*)'), # ---
- 'isline3': _wrapRe(r'(\_*)'), # ___
- 'tabbed': _wrapRe(r'((\t)|( ))(.*)'), # an indented line
- 'quoted': _wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...")
- 'containsline': re.compile(r'^([-]*)$|^([=]*)$', re.M),
- 'attr': re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
-}
-
+
class Markdown:
- """Converts markdown to HTML."""
+ """Convert Markdown to HTML."""
def __init__(self,
extensions=[],
@@ -1061,7 +1480,7 @@ class Markdown:
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
"""
- self.source = None
+ self.parser = MarkdownParser()
self.safeMode = safe_mode
self.registeredExtensions = []
self.docType = ""
@@ -1172,373 +1591,6 @@ class Markdown:
for pattern in self.inlinePatterns:
pattern.safe_mode = self.safeMode
- def _processSection(self, parent_elem, lines,
- inList=0, looseList=0):
- """
- Process a section of a source document, looking for high
- level structural elements like lists, block quotes, code
- segments, html blocks, etc. Some those then get stripped
- of their high level markup (e.g. get unindented) and the
- lower-level markup is processed recursively.
-
- Keyword arguments:
-
- * parent_elem: A ElementTree element to which the content will be added.
- * lines: a list of lines
- * inList: a level
-
- Returns: None
-
- """
- # Loop through lines until none left.
- while lines:
-
- # Skipping empty line
- if not lines[0]:
- lines = lines[1:]
- continue
-
- # Check if this section starts with a list, a blockquote or
- # a code block
-
- processFn = { 'ul': self._processUList,
- 'ol': self._processOList,
- 'quoted': self._processQuote,
- 'tabbed': self._processCodeBlock}
-
- for regexp in ['ul', 'ol', 'quoted', 'tabbed']:
- m = CORE_RE[regexp].match(lines[0])
- if m:
- processFn[regexp](parent_elem, lines, inList)
- return
-
- # We are NOT looking at one of the high-level structures like
- # lists or blockquotes. So, it's just a regular paragraph
- # (though perhaps nested inside a list or something else). If
- # we are NOT inside a list, we just need to look for a blank
- # line to find the end of the block. If we ARE inside a
- # list, however, we need to consider that a sublist does not
- # need to be separated by a blank line. Rather, the following
- # markup is legal:
- #
- # * The top level list item
- #
- # Another paragraph of the list. This is where we are now.
- # * Underneath we might have a sublist.
- #
-
- if inList:
-
- start, lines = self._linesUntil(lines, (lambda line:
- CORE_RE['ul'].match(line)
- or CORE_RE['ol'].match(line)
- or not line.strip()))
-
- self._processSection(parent_elem, start,
- inList - 1, looseList = looseList)
- inList = inList-1
-
- else: # Ok, so it's just a simple block
-
- paragraph, lines = self._linesUntil(lines, lambda line:
- not line.strip() or line[0] == '>')
-
- if len(paragraph) and paragraph[0].startswith('#'):
- self._processHeader(parent_elem, paragraph)
-
- elif len(paragraph) and \
- CORE_RE["isline3"].match(paragraph[0]):
-
- self._processHR(parent_elem)
- lines = paragraph[1:] + lines
-
- elif paragraph:
- self._processParagraph(parent_elem, paragraph,
- inList, looseList)
-
- if lines and not lines[0].strip():
- lines = lines[1:] # skip the first (blank) line
-
- def _processHR(self, parentElem):
- hr = etree.SubElement(parentElem, "hr")
-
- def _processHeader(self, parentElem, paragraph):
- m = CORE_RE['header'].match(paragraph[0])
- if m:
- level = len(m.group(1))
- h = etree.SubElement(parentElem, "h%d" % level)
- h.text = m.group(2).strip()
- else:
- message(CRITICAL, "We've got a problem header!")
-
-
- def _processParagraph(self, parentElem, paragraph, inList, looseList):
-
- if ( parentElem.tag == 'li'
- and not (looseList or parentElem.getchildren())):
-
- # If this is the first paragraph inside "li", don't
- # put <p> around it - append the paragraph bits directly
- # onto parentElem
- el = parentElem
- else:
- # Otherwise make a "p" element
- el = etree.SubElement(parentElem, "p")
-
- dump = []
-
- # Searching for hr or header
- for line in paragraph:
- # it's hr
- if CORE_RE["isline3"].match(line):
- el.text = "\n".join(dump)
- self._processHR(el)
- dump = []
- # it's header
- elif line.startswith("#"):
- el.text = "\n".join(dump)
- self._processHeader(parentElem, [line])
- dump = []
- else:
- dump.append(line)
- if dump:
- text = "\n".join(dump)
- el.text = text
-
- def _processUList(self, parentElem, lines, inList):
- self._processList(parentElem, lines, inList,
- listexpr='ul', tag = 'ul')
-
- def _processOList(self, parentElem, lines, inList):
- self._processList(parentElem, lines, inList,
- listexpr='ol', tag = 'ol')
-
-
- def _processList(self, parentElem, lines, inList, listexpr, tag):
- """
- Given a list of document lines starting with a list item,
- finds the end of the list, breaks it up, and recursively
- processes each list item and the remainder of the text file.
-
- Keyword arguments:
-
- * parentElem: A ElementTree element to which the content will be added
- * lines: a list of lines
- * inList: a level
-
- Returns: None
-
- """
- ul = etree.SubElement(parentElem, tag) # ul might actually be '<ol>'
-
- looseList = 0
-
- # Make a list of list items
- items = []
- item = -1
-
- i = 0 # a counter to keep track of where we are
-
- for line in lines:
-
- loose = 0
- if not line.strip():
- # If we see a blank line, this _might_ be the end of the list
- i += 1
- loose = 1
-
- # Find the next non-blank line
- for j in range(i, len(lines)):
- if lines[j].strip():
- next = lines[j]
- break
- else:
- # There is no more text => end of the list
- break
-
- # Check if the next non-blank line is still a part of the list
-
- if ( CORE_RE[listexpr].match(next) or
- CORE_RE['tabbed'].match(next) ):
- # get rid of any white space in the line
- items[item].append(line.strip())
- looseList = loose or looseList
- continue
- else:
- break # found end of the list
-
- # Now we need to detect list items (at the current level)
- # while also detabing child elements if necessary
-
- for expr in ['ul', 'ol', 'tabbed']:
-
- m = CORE_RE[expr].match(line)
- if m:
- if expr in ['ul', 'ol']: # We are looking at a new item
- #if m.group(1) :
- # Removed the check to allow for a blank line
- # at the beginning of the list item
- items.append([m.group(1)])
- item += 1
- elif expr == 'tabbed': # This line needs to be detabbed
- items[item].append(m.group(4)) #after the 'tab'
-
- i += 1
- break
- else:
- items[item].append(line) # Just regular continuation
- i += 1 # added on 2006.02.25
- else:
- i += 1
-
- # Add the ElementTree elements
- for item in items:
- li = etree.SubElement(ul, "li")
-
- self._processSection(li, item, inList + 1, looseList = looseList)
-
- # Process the remaining part of the section
-
- self._processSection(parentElem, lines[i:], inList)
-
-
- def _linesUntil(self, lines, condition):
- """
- A utility function to break a list of lines upon the
- first line that satisfied a condition. The condition
- argument should be a predicate function.
-
- """
- i = -1
- for line in lines:
- i += 1
- if condition(line):
- break
- else:
- i += 1
- return lines[:i], lines[i:]
-
- def _processQuote(self, parentElem, lines, inList):
- """
- Given a list of document lines starting with a quote finds
- the end of the quote, unindents it and recursively
- processes the body of the quote and the remainder of the
- text file.
-
- Keyword arguments:
-
- * parentElem: ElementTree element to which the content will be added
- * lines: a list of lines
- * inList: a level
-
- Returns: None
-
- """
- dequoted = []
- i = 0
- blank_line = False # allow one blank line between paragraphs
- for line in lines:
- m = CORE_RE['quoted'].match(line)
- if m:
- dequoted.append(m.group(1))
- i += 1
- blank_line = False
- elif not blank_line and line.strip() != '':
- dequoted.append(line)
- i += 1
- elif not blank_line and line.strip() == '':
- dequoted.append(line)
- i += 1
- blank_line = True
- else:
- break
-
- blockquote = etree.SubElement(parentElem, "blockquote")
-
- self._processSection(blockquote, dequoted, inList)
- self._processSection(parentElem, lines[i:], inList)
-
-
-
-
- def _processCodeBlock(self, parentElem, lines, inList):
- """
- Given a list of document lines starting with a code block
- finds the end of the block, puts it into the ElementTree verbatim
- wrapped in ("<pre><code>") and recursively processes the
- the remainder of the text file.
-
- Keyword arguments:
-
- * parentElem: ElementTree element to which the content will be added
- * lines: a list of lines
- * inList: a level
-
- Returns: None
-
- """
- detabbed, theRest = self.detectTabbed(lines)
-
- pre = etree.SubElement(parentElem, "pre")
- code = etree.SubElement(pre, "code")
-
- text = "\n".join(detabbed).rstrip()+"\n"
- code.text = AtomicString(text)
- self._processSection(parentElem, theRest, inList)
-
- def detectTabbed(self, lines):
- """ Find indented text and remove indent before further proccesing.
-
- Keyword arguments:
-
- * lines: an array of strings
- * fn: a function that returns a substring of a string
- if the string matches the necessary criteria
-
- Returns: a list of post processes items and the unused
- remainder of the original list
-
- """
- items = []
- item = -1
- i = 0 # to keep track of where we are
-
- def detab(line):
- match = CORE_RE['tabbed'].match(line)
- if match:
- return match.group(4)
-
- for line in lines:
- if line.strip(): # Non-blank line
- line = detab(line)
- if line:
- items.append(line)
- i += 1
- continue
- else:
- return items, lines[i:]
-
- else: # Blank line: _maybe_ we are done.
- i += 1 # advance
-
- # Find the next non-blank line
- for j in range(i, len(lines)):
- if lines[j].strip():
- next_line = lines[j]; break
- else:
- break # There is no more text; we are done.
-
- # Check if the next non-blank line is tabbed
- if detab(next_line): # Yes, more work to do.
- items.append("")
- continue
- else:
- break # No, we are done.
- else:
- i += 1
-
- return items, lines[i:]
-
def _handleInline(self, data, patternIndex=0):
"""
Process string with inline patterns and replace it
@@ -1563,50 +1615,8 @@ class Markdown:
if not matched:
patternIndex += 1
return data
-
- def _applyInline(self, pattern, data, patternIndex, startIndex=0):
- """
- Check if the line fits the pattern, create the necessary
- elements, add it to InlineStash
-
- Keyword arguments:
-
- * data: the text to be processed
- * pattern: the pattern to be checked
- * patternIndex: index of current pattern
- * startIndex: string index, from which we starting search
- Returns: String with placeholders instead of ElementTree elements.
- """
- match = pattern.getCompiledRegExp().match(data[startIndex:])
- leftData = data[:startIndex]
-
- if not match:
- return data, False, 0
- node = pattern.handleMatch(match)
-
- if node is None:
- return data, True, len(leftData) + match.span(len(match.groups()))[0]
-
- if not isString(node):
- if not isinstance(node.text, AtomicString):
- # We need to process current node too
- for child in [node] + node.getchildren():
- if not isString(node):
- if child.text:
- child.text = self._handleInline(child.text,
- patternIndex + 1)
- if child.tail:
- child.tail = self._handleInline(child.tail,
- patternIndex)
-
- pholder = self.inlineStash.add(node, pattern.type())
-
- return "%s%s%s%s" % (leftData,
- match.group(1),
- pholder, match.groups()[-1]), True, 0
-
def _processElementText(self, node, subnode, isText=True):
"""
Process placeholders in Element.text or Element.tail
@@ -1706,6 +1716,51 @@ class Markdown:
data = ""
return result
+
+
+ def _applyInline(self, pattern, data, patternIndex, startIndex=0):
+ """
+ Check if the line fits the pattern, create the necessary
+ elements, add it to InlineStash
+
+ Keyword arguments:
+
+ * data: the text to be processed
+ * pattern: the pattern to be checked
+ * patternIndex: index of current pattern
+ * startIndex: string index, from which we starting search
+
+ Returns: String with placeholders instead of ElementTree elements.
+ """
+ match = pattern.getCompiledRegExp().match(data[startIndex:])
+ leftData = data[:startIndex]
+
+ if not match:
+ return data, False, 0
+
+ node = pattern.handleMatch(match)
+
+ if node is None:
+ return data, True, len(leftData) + match.span(len(match.groups()))[0]
+
+ if not isString(node):
+ if not isinstance(node.text, AtomicString):
+ # We need to process current node too
+ for child in [node] + node.getchildren():
+ if not isString(node):
+ if child.text:
+ child.text = self._handleInline(child.text,
+ patternIndex + 1)
+ if child.tail:
+ child.tail = self._handleInline(child.tail,
+ patternIndex)
+
+ pholder = self.inlineStash.add(node, pattern.type())
+
+ return "%s%s%s%s" % (leftData,
+ match.group(1),
+ pholder, match.groups()[-1]), True, 0
+
def applyInlinePatterns(self, markdownTree):
"""
@@ -1756,66 +1811,36 @@ class Markdown:
return markdownTree
- def markdownToTree(self, source=None):
- """Create ElementTree, without applying inline paterns.
-
- Keyword arguments:
-
- * source: An ascii or unicode string of Markdown formated text.
+ def convert (self, source):
+ """Convert markdown to serialized XHTML."""
- Returns: ElementTree object.
- """
+ # Fixup the source text
+ if not source:
+ return u"" # a blank unicode string
try:
- self.source = unicode(self.source)
+ source = unicode(source)
except UnicodeDecodeError:
- message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
+ message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
return u""
-
- # Fixup the source text
- self.source = self.source.replace(STX, "")
- self.source = self.source.replace(ETX, "")
- self.source = self.source.replace("\r\n", "\n").replace("\r", "\n")
- self.source += "\n\n"
- self.source = self.source.expandtabs(TAB_LENGTH)
+ source = source.replace(STX, "")
+ source = source.replace(ETX, "")
+ source = source.replace("\r\n", "\n").replace("\r", "\n")
+ source += "\n\n"
+ source = source.expandtabs(TAB_LENGTH)
+
+ # Run the text preprocessors
for pp in self.textPreprocessors:
- self.source = pp.run(self.source)
+ source = pp.run(source)
- # Split into lines and run the preprocessors that will work with
- # self.lines
- self.lines = self.source.split("\n")
+ # Split into lines and run the line preprocessors.
+ self.lines = source.split("\n")
for prep in self.preprocessors :
self.lines = prep.run(self.lines)
- # Create a ElementTree from the lines
- self.root = etree.Element("div")
- buffer = []
- for line in self.lines:
- if line.startswith("#"):
- self._processSection(self.root, buffer)
- buffer = [line]
- else:
- buffer.append(line)
-
- self._processSection(self.root, buffer)
-
- return etree.ElementTree(self.root)
-
-
- def convert (self, source):
- """Convert markdown to serialized XHTML.
-
- Keyword arguments:
-
- * source: An ascii or unicode string of Markdown formated text.
-
- """
- self.source = source
- if not self.source:
- return u"" # a blank unicode string
+ # Parse the high-level elements.
+ tree = self.parser.parseDocument(self.lines)
- # Build a tree from the Markdown source and get its root.
- tree = self.markdownToTree(source)
root = self.applyInlinePatterns(tree).getroot()
# Run the post-processors
@@ -1836,98 +1861,47 @@ class Markdown:
return xml.strip()
- def __str__(self):
- """ Report info about instance. Markdown always returns unicode."""
- if self.source is None:
- status = 'in which no source text has been assinged.'
- else:
- status = 'which contains %d chars and %d line(s) of source.'%\
- (len(self.source), self.source.count('\n')+1)
- return 'An instance of "%s" %s'% (self.__class__, status)
-
- __unicode__ = convert # markdown should always return a unicode string
-
-
-"""
-EXPORTED FUNCTIONS
-=============================================================================
-
-Those are the two functions we really mean to export: markdown() and
-markdownFromFile().
-"""
-
-def markdownFromFile(input = None,
- output = None,
- extensions = [],
- encoding = None,
- safe = False):
- """Converts a markdown file and returns the HTML as a unicode string.
-
- Used from the command-line, although may be useful in other situations.
- Decodes the file using the provided encoding (defaults to utf-8), passes
- the file content to markdown, and outputs the html to either the provided
- filename or stdout in the same encoding as the source file.
-
- **Note:** This is the only place that decoding and encoding of unicode
- takes place in Python-Markdown. (All other code is unicode-in /
- unicode-out.)
-
- Keyword arguments:
-
- * input: Name of source text file.
- * output: Name of output file. Writes to stdout if `None`.
- * extensions: A list of extension names (may contain config args).
- * encoding: Encoding of input and output files. Defaults to utf-8.
- * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
-
- """
-
- encoding = encoding or "utf-8"
-
- # Read the source
- input_file = codecs.open(input, mode="r", encoding=encoding)
- text = input_file.read()
- input_file.close()
- text = text.lstrip(u'\ufeff') # remove the byte-order mark
-
- # Convert
- html = markdown(text, extensions, safe_mode = safe)
-
- # Write to file or stdout
- if output:
- output_file = codecs.open(output, "w", encoding=encoding)
- output_file.write(html)
- output_file.close()
- else:
- sys.stdout.write(html.encode(encoding))
-
-def markdown(text,
- extensions = [],
- safe_mode = False):
- """
- Convenience wrapper function for `Markdown` class.
-
- Useful in a typical use case. Initializes an instance of the `Markdown`
- class, loads any extensions and runs the parser on the given text.
+ def convertFile(input = None, output = None, encoding = None):
+ """Convert a markdown file and write the HTML to a file or stream.
- Keyword arguments:
+ Decodes the file using the provided encoding (defaults to utf-8),
+ passes the file content to markdown, and outputs the html to either
+ the provided stream or the file with provided name, using the same
+ encoding as the source file.
- * text: An ascii or Unicode string of Markdown formatted text.
- * extensions: A list of extension names (may contain config args).
- * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
+ **Note:** This is the only place that decoding and encoding of unicode
+ takes place in Python-Markdown. (All other code is unicode-in /
+ unicode-out.)
- Returns: An HTML document as a string.
-
- """
- message(DEBUG, "in markdown.markdown(), received text:\n%s" % text)
-
- extensions = [load_extension(e) for e in extensions]
+ Keyword arguments:
- md = Markdown(extensions=extensions,
- safe_mode = safe_mode)
+ * input: Name of source text file.
+ * output: Name of output file, or a writable stream.
+ * extensions: A list of extension names (may contain config args).
+ * encoding: Encoding of input and output files. Defaults to utf-8.
+ * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
- return md.convert(text)
+ """
+ encoding = encoding or "utf-8"
+
+ # Read the source
+ input_file = codecs.open(input, mode="r", encoding=encoding)
+ text = input_file.read()
+ input_file.close()
+ text = text.lstrip(u'\ufeff') # remove the byte-order mark
+
+ # Convert
+ html = self.convert(text)
+
+ # Write to file or stdout
+ if type(output) == type("string"):
+ output_file = codecs.open(output, "w", encoding=encoding)
+ output_file.write(html)
+ output_file.close()
+ else:
+ output.write(html.encode(encoding))
+
"""
Extensions
@@ -1966,65 +1940,113 @@ class Extension:
This method must be overridden by every extension.
- Ketword arguments:
+ Keyword arguments:
* md: The Markdown instance.
- * md_globals: All global variables availabel in the markdown module
- namespace.
+ * md_globals: Global variables in the markdown module namespace.
"""
pass
def load_extension(ext_name, configs = []):
- """
- Load extension by name, then return the module.
+ """Load extension by name, then return the module.
The extension name may contain arguments as part of the string in the
- following format:
-
- "extname(key1=value1,key2=value2)"
-
- Print an error message and exit on failure.
+ following format: "extname(key1=value1,key2=value2)"
"""
- # I am making the assumption that the order of config options
- # does not matter.
+ # Parse extensions config params (ignore the order)
configs = dict(configs)
- pos = ext_name.find("(")
+ pos = ext_name.find("(") # find the first "("
if pos > 0:
ext_args = ext_name[pos+1:-1]
ext_name = ext_name[:pos]
pairs = [x.split("=") for x in ext_args.split(",")]
configs.update([(x.strip(), y.strip()) for (x, y) in pairs])
+ # Setup the module names
ext_module = 'markdown_extensions'
- module_name = '.'.join([ext_module, ext_name])
- extension_module_name = '_'.join(['mdx', ext_name])
+ module_name_new_style = '.'.join([ext_module, ext_name])
+ module_name_old_style = '_'.join(['mdx', ext_name])
- try:
- module = __import__(module_name, {}, {}, [ext_module])
+ # Try loading the extension first from one place, then another
+ try: # New style (markdown_extensions.<extension>)
+ module = __import__(module_name_new_style, {}, {}, [ext_module])
except ImportError:
+ try: # Old style (mdx_<extension>)
+ module = __import__(module_name_old_style)
+ except ImportError:
+ pass
+
+ if module :
+ # If the module is loaded successfully, we expect it to define a
+ # function called makeExtension()
try:
- module = __import__(extension_module_name)
+ return module.makeExtension(configs.items())
except:
- message(WARN,
- "Failed loading extension '%s' from '%s' or '%s' "
- "- continuing without."
- % (ext_name, module_name, extension_module_name) )
- # Return a dummy (do nothing) Extension as silent failure
- return Extension(configs={})
-
- return module.makeExtension(configs.items())
+ message(WARN, "Failed to instantiate extension '%s'" % ext_name)
+ else:
+ message(WARN, "Failed loading extension '%s' from '%s' or '%s'"
+ % (ext_name, module_name_new_style, module_name_old_style))
+def load_extensions(ext_names):
+ """Loads multiple extensions"""
+ extensions = []
+ for ext_name in ext_names:
+ extension = load_extension(ext_name)
+ if extension:
+ extensions.append(extension)
# Extensions should use "markdown.etree" instead of "etree" (or do `from
# markdown import etree`). Do not import it by yourself.
etree = importETree()
+"""
+EXPORTED FUNCTIONS
+=============================================================================
+
+These are the two functions we really mean to export: markdown() and
+markdownFromFile().
+"""
+
+def markdown(text,
+ extensions = [],
+ safe_mode = False):
+ """Convert a markdown string to HTML and return HTML as a unicode string.
+
+ This is a shortcut function for `Markdown` class to cover the most
+ basic use case. It initializes an instance of Markdown, loads the
+ necessary extensions and runs the parser on the given text.
+
+ Keyword arguments:
+
+ * text: Markdown formatted text as Unicode or ASCII string.
+ * extensions: A list of extensions or extension names (may contain config args).
+ * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
+
+ Returns: An HTML document as a string.
+
+ """
+ md = Markdown(extensions=load_extensions(extensions),
+ safe_mode = safe_mode)
+ return md.convert(text)
+
+
+def markdownFromFile(input = None,
+ output = None,
+ extensions = [],
+ encoding = None,
+ safe = False):
+
+
+ md = Markdown(extensions=load_extensions(extensions),
+ safe_mode = safe_mode)
+ md.convertFile(input, output, encoding)
+
"""
COMMAND-LINE SPECIFIC STUFF