author     Yuri Takhteyev <yuri@freewisdom.org>    2008-10-12 20:40:48 -0700
committer  Yuri Takhteyev <yuri@freewisdom.org>    2008-10-12 20:40:48 -0700
commit     2d349a1f5dc4b55f2d2bcd7b9844d12ed0d31081 (patch)
tree       a00c5dbb3c825e546f686c65116d8c74a36a324c /markdown.py
parent     8b6b7b0a39321dadfcab4d0a16053377c4715bee (diff)
download   markdown-2d349a1f5dc4b55f2d2bcd7b9844d12ed0d31081.tar.gz
           markdown-2d349a1f5dc4b55f2d2bcd7b9844d12ed0d31081.tar.bz2
           markdown-2d349a1f5dc4b55f2d2bcd7b9844d12ed0d31081.zip
Made private methods actually private (to keep us honest) and removed
unnecessary whitespace.
Diffstat (limited to 'markdown.py')
-rwxr-xr-x  markdown.py  880
1 file changed, 427 insertions, 453 deletions
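
The rename from single- to double-underscore methods (for example `_processUList` becoming `__processUList` below) leans on Python's name mangling: a name with two leading underscores is rewritten inside the class body to `_ClassName__name`, so it can no longer be reached by its plain name from outside the class. A minimal sketch, with an illustrative class that is not part of the module:

    class Parser(object):
        def _helper(self):              # single underscore: convention only
            return "still reachable from outside"
        def __helper(self):             # double underscore: name-mangled
            return "mangled to _Parser__helper"

    p = Parser()
    print p._helper()                   # works
    print p._Parser__helper()           # works, but only via the mangled name
    try:
        p.__helper()                    # plain name no longer resolves
    except AttributeError:
        print "no attribute '__helper' from outside the class"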
diff --git a/markdown.py b/markdown.py
index ae8dc10..dc5a9b6 100755
--- a/markdown.py
+++ b/markdown.py
@@ -32,9 +32,9 @@ Limberg](http://achinghead.com/) and [Artem Yunusov](http://blog.splyer.com).
Contact: markdown@freewisdom.org
-Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
-Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
-Copyright 2004 Manfred Stienstra (the original version)
+Copyright 2007, 2008 The Python Markdown Project (v. 1.7 and later)
+Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
+Copyright 2004 Manfred Stienstra (the original version)
License: BSD (see docs/LICENSE for details).
"""
@@ -62,7 +62,7 @@ Constants you might want to modify
"""
# default logging level for command-line use
-COMMAND_LINE_LOGGING_LEVEL = CRITICAL
+COMMAND_LINE_LOGGING_LEVEL = CRITICAL
TAB_LENGTH = 4 # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True # this_or_that does not become this<i>or</i>that
@@ -95,7 +95,7 @@ HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:"
HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX
INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
-AMP_SUBSTITUTE = STX+"amp"+ETX
+AMP_SUBSTITUTE = STX+"amp"+ETX
def wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL)
CORE_RE = {
@@ -124,15 +124,15 @@ AUXILIARY GLOBAL FUNCTIONS
def message(level, text):
""" A wrapper method for logging debug messages. """
logging.getLogger('MARKDOWN').log(level, text)
-
+
def isString(s):
""" Check if it's string """
return isinstance(s, unicode) or isinstance(s, str)
-## Import
-def importETree():
+## Import
+def importETree():
"""Import the best implementation of ElementTree, return a module object."""
- etree_in_c = None
+ etree_in_c = None
try: # Is it Python 2.5+ with C implemenation of ElementTree installed?
import xml.etree.cElementTree as etree_in_c
except ImportError:
@@ -155,9 +155,9 @@ def importETree():
elif etree.VERSION < "1.1":
message(CRITICAL, "For ElementTree version 1.1 or higher is required")
sys.exit(1)
- else :
+ else :
return etree
-
+
def isBlockLevel(tag):
"""Check if the tag is a block level HTML tag."""
return BLOCK_LEVEL_ELEMENTS.match(tag)
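
A few lines up, `importETree` walks down a list of progressively less desirable ElementTree implementations. The overall shape of that fallback, condensed into a sketch that leaves out the version checks the real function performs:

    def import_best_etree():
        # Condensed sketch of the try/except cascade; the real importETree
        # also checks etree.VERSION and prefers the C implementation.
        try:
            import xml.etree.cElementTree as etree          # Python 2.5+, C speedups
        except ImportError:
            try:
                import xml.etree.ElementTree as etree       # Python 2.5+, pure Python
            except ImportError:
                try:
                    import cElementTree as etree            # standalone C module
                except ImportError:
                    import elementtree.ElementTree as etree # standalone pure Python
        return etree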
@@ -229,9 +229,8 @@ class MarkdownParser:
buffer.append(line)
self.parseChunk(root, buffer)
-
- return etree.ElementTree(root)
+ return etree.ElementTree(root)
def parseChunk(self, parent_elem, lines, inList=0, looseList=0):
"""Process a chunk of markdown-formatted text and attach the parse to
@@ -244,30 +243,28 @@ class MarkdownParser:
lower-level markup is processed recursively.
Keyword arguments:
-
- * parent_elem: A ElementTree element to which the content will be added.
+
+ * parent_elem: The ElementTree element to which the content will be
+ added.
* lines: a list of lines
* inList: a level
-
+
Returns: None
-
+
"""
# Loop through lines until none left.
while lines:
-
# Skipping empty line
if not lines[0]:
lines = lines[1:]
continue
-
- # Check if this section starts with a list, a blockquote or
- # a code block
-
- processFn = { 'ul': self._processUList,
- 'ol': self._processOList,
- 'quoted': self._processQuote,
- 'tabbed': self._processCodeBlock}
+ # Check if this section starts with a list, a blockquote or
+ # a code block. If so, process them.
+ processFn = { 'ul': self.__processUList,
+ 'ol': self.__processOList,
+ 'quoted': self.__processQuote,
+ 'tabbed': self.__processCodeBlock}
for regexp in ['ul', 'ol', 'quoted', 'tabbed']:
m = CORE_RE[regexp].match(lines[0])
if m:
@@ -290,40 +287,33 @@ class MarkdownParser:
#
if inList:
-
- start, lines = self._linesUntil(lines, (lambda line:
+ start, lines = self.__linesUntil(lines, (lambda line:
CORE_RE['ul'].match(line)
or CORE_RE['ol'].match(line)
or not line.strip()))
-
- self.parseChunk(parent_elem, start, inList-1, looseList=looseList)
+ self.parseChunk(parent_elem, start, inList-1,
+ looseList=looseList)
inList = inList-1
else: # Ok, so it's just a simple block
-
- paragraph, lines = self._linesUntil(lines, lambda line:
- not line.strip() or line[0] == '>')
-
+ test = lambda line: not line.strip() or line[0] == '>'
+ paragraph, lines = self.__linesUntil(lines, test)
if len(paragraph) and paragraph[0].startswith('#'):
- self._processHeader(parent_elem, paragraph)
-
- elif len(paragraph) and \
- CORE_RE["isline3"].match(paragraph[0]):
-
- self._processHR(parent_elem)
+ self.__processHeader(parent_elem, paragraph)
+ elif len(paragraph) and CORE_RE["isline3"].match(paragraph[0]):
+ self.__processHR(parent_elem)
lines = paragraph[1:] + lines
-
elif paragraph:
- self._processParagraph(parent_elem, paragraph,
+ self.__processParagraph(parent_elem, paragraph,
inList, looseList)
if lines and not lines[0].strip():
lines = lines[1:] # skip the first (blank) line
- def _processHR(self, parentElem):
+ def __processHR(self, parentElem):
hr = etree.SubElement(parentElem, "hr")
-
- def _processHeader(self, parentElem, paragraph):
+
+ def __processHeader(self, parentElem, paragraph):
m = CORE_RE['header'].match(paragraph[0])
if m:
level = len(m.group(1))
@@ -332,8 +322,7 @@ class MarkdownParser:
else:
message(CRITICAL, "We've got a problem header!")
-
- def _processParagraph(self, parentElem, paragraph, inList, looseList):
+ def __processParagraph(self, parentElem, paragraph, inList, looseList):
if ( parentElem.tag == 'li'
and not (looseList or parentElem.getchildren())):
@@ -347,48 +336,45 @@ class MarkdownParser:
el = etree.SubElement(parentElem, "p")
dump = []
-
+
# Searching for hr or header
for line in paragraph:
# it's hr
if CORE_RE["isline3"].match(line):
el.text = "\n".join(dump)
- self._processHR(el)
+ self.__processHR(el)
dump = []
# it's header
elif line.startswith("#"):
- el.text = "\n".join(dump)
- self._processHeader(parentElem, [line])
- dump = []
+ el.text = "\n".join(dump)
+ self.__processHeader(parentElem, [line])
+ dump = []
else:
dump.append(line)
if dump:
- text = "\n".join(dump)
+ text = "\n".join(dump)
el.text = text
- def _processUList(self, parentElem, lines, inList):
- self._processList(parentElem, lines, inList,
- listexpr='ul', tag = 'ul')
-
- def _processOList(self, parentElem, lines, inList):
- self._processList(parentElem, lines, inList,
- listexpr='ol', tag = 'ol')
+ def __processUList(self, parentElem, lines, inList):
+ self.__processList(parentElem, lines, inList, listexpr='ul', tag='ul')
+ def __processOList(self, parentElem, lines, inList):
+ self.__processList(parentElem, lines, inList, listexpr='ol', tag='ol')
- def _processList(self, parentElem, lines, inList, listexpr, tag):
+ def __processList(self, parentElem, lines, inList, listexpr, tag):
"""
Given a list of document lines starting with a list item,
finds the end of the list, breaks it up, and recursively
processes each list item and the remainder of the text file.
Keyword arguments:
-
+
* parentElem: A ElementTree element to which the content will be added
* lines: a list of lines
* inList: a level
-
+
Returns: None
-
+
"""
ul = etree.SubElement(parentElem, tag) # ul might actually be '<ol>'
@@ -399,9 +385,7 @@ class MarkdownParser:
item = -1
i = 0 # a counter to keep track of where we are
-
- for line in lines:
-
+ for line in lines:
loose = 0
if not line.strip():
# If we see a blank line, this _might_ be the end of the list
@@ -432,7 +416,6 @@ class MarkdownParser:
# while also detabing child elements if necessary
for expr in ['ul', 'ol', 'tabbed']:
-
m = CORE_RE[expr].match(line)
if m:
if expr in ['ul', 'ol']: # We are looking at a new item
@@ -443,7 +426,6 @@ class MarkdownParser:
item += 1
elif expr == 'tabbed': # This line needs to be detabbed
items[item].append(m.group(4)) #after the 'tab'
-
i += 1
break
else:
@@ -455,31 +437,28 @@ class MarkdownParser:
# Add the ElementTree elements
for item in items:
li = etree.SubElement(ul, "li")
-
self.parseChunk(li, item, inList + 1, looseList = looseList)
# Process the remaining part of the section
-
self.parseChunk(parentElem, lines[i:], inList)
-
- def _linesUntil(self, lines, condition):
- """
+ def __linesUntil(self, lines, condition):
+ """
A utility function to break a list of lines upon the
first line that satisfied a condition. The condition
argument should be a predicate function.
-
+
"""
i = -1
for line in lines:
i += 1
- if condition(line):
+ if condition(line):
break
else:
i += 1
return lines[:i], lines[i:]
- def _processQuote(self, parentElem, lines, inList):
+ def __processQuote(self, parentElem, lines, inList):
"""
Given a list of document lines starting with a quote finds
the end of the quote, unindents it and recursively
@@ -487,13 +466,13 @@ class MarkdownParser:
text file.
Keyword arguments:
-
+
* parentElem: ElementTree element to which the content will be added
* lines: a list of lines
* inList: a level
-
- Returns: None
-
+
+ Returns: None
+
"""
dequoted = []
i = 0
@@ -519,10 +498,7 @@ class MarkdownParser:
self.parseChunk(blockquote, dequoted, inList)
self.parseChunk(parentElem, lines[i:], inList)
-
-
-
- def _processCodeBlock(self, parentElem, lines, inList):
+ def __processCodeBlock(self, parentElem, lines, inList):
"""
Given a list of document lines starting with a code block
finds the end of the block, puts it into the ElementTree verbatim
@@ -530,35 +506,33 @@ class MarkdownParser:
the remainder of the text file.
Keyword arguments:
-
+
* parentElem: ElementTree element to which the content will be added
* lines: a list of lines
* inList: a level
-
+
Returns: None
-
- """
- detabbed, theRest = self.detectTabbed(lines)
+ """
+ detabbed, theRest = self.__detectTabbed(lines)
pre = etree.SubElement(parentElem, "pre")
code = etree.SubElement(pre, "code")
-
text = "\n".join(detabbed).rstrip()+"\n"
code.text = AtomicString(text)
- self.parseChunk(parentElem, theRest, inList)
+ self.parseChunk(parentElem, theRest, inList)
- def detectTabbed(self, lines):
+ def __detectTabbed(self, lines):
""" Find indented text and remove indent before further proccesing.
Keyword arguments:
-
+
* lines: an array of strings
* fn: a function that returns a substring of a string
if the string matches the necessary criteria
-
+
Returns: a list of post processes items and the unused
remainder of the original list
-
+
"""
items = []
item = -1
@@ -583,7 +557,7 @@ class MarkdownParser:
i += 1 # advance
# Find the next non-blank line
- for j in range(i, len(lines)):
+ for j in range(i, len(lines)):
if lines[j].strip():
next_line = lines[j]; break
else:
@@ -601,6 +575,275 @@ class MarkdownParser:
return items, lines[i:]
+"""
+INLINE PROCESSOR
+=============================================================================
+
+This class applies the inline patterns (emphasis, links, inline code and the
+like) to the tree produced by the block-level parser, stashing generated
+elements behind placeholders until the tree is reassembled.
+"""
+
+class InlineProcessor:
+ """
+ An auxiliary class to traverse a Markdown tree, applying inline patterns.
+ """
+
+ def __init__ (self, patterns):
+ self.__inlinePatterns = patterns
+ self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX
+ self.__placeholder_suffix = ETX
+ self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
+ + len(self.__placeholder_suffix)
+ self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})')
+
+ def __makePlaceholder(self, type):
+ """ Generate a placeholder """
+ id = "%04d" % len(self.stashed_nodes)
+ hash = INLINE_PLACEHOLDER % id
+ return hash, id
+
+ def __findPlaceholder(self, data, index):
+ """
+ Extract id from data string, start from index
+
+ Keyword arguments:
+
+ * data: string
+ * index: index, from which we start search
+
+ Returns: placeholder id and string index, after the found placeholder.
+ """
+
+ m = self.__placeholder_re.search(data, index)
+ if m:
+ return m.group(1), m.end()
+ else:
+ return None, index + 1
+
+ def __stashNode(self, node, type):
+ """ Add node to stash """
+ placeholder, id = self.__makePlaceholder(type)
+ self.stashed_nodes[id] = node
+ return placeholder
+
+ def __handleInline(self, data, patternIndex=0):
+ """
+ Process string with inline patterns and replace it
+ with placeholders
+
+ Keyword arguments:
+
+ * data: A line of Markdown text
+ * patternIndex: The index of the inlinePattern to start with
+
+ Returns: String with placeholders.
+
+ """
+ if not isinstance(data, AtomicString):
+ startIndex = 0
+ while patternIndex < len(self.__inlinePatterns):
+ data, matched, startIndex = self.__applyPattern(
+ self.__inlinePatterns[patternIndex],
+ data, patternIndex, startIndex)
+ if not matched:
+ patternIndex += 1
+ return data
+
+ def __processElementText(self, node, subnode, isText=True):
+ """
+ Process placeholders in Element.text or Element.tail
+ of Elements popped from self.stashed_nodes.
+
+ Keywords arguments:
+
+ * node: parent node
+ * subnode: processing node
+ * isText: bool variable, True - it's text, False - it's tail
+
+ Returns: None
+
+ """
+ if isText:
+ text = subnode.text
+ subnode.text = None
+ else:
+ text = subnode.tail
+ subnode.tail = None
+
+ childResult = self.__processPlaceholders(text, subnode)
+
+ if not isText and node is not subnode:
+ pos = node.getchildren().index(subnode)
+ node.remove(subnode)
+ else:
+ pos = 0
+
+ childResult.reverse()
+ for newChild in childResult:
+ node.insert(pos, newChild)
+
+ def __processPlaceholders(self, data, parent):
+ """
+ Process string with placeholders and generate ElementTree tree.
+
+ Keyword arguments:
+
+ * data: string with placeholders instead of ElementTree elements.
+ * parent: Element, which contains processing inline data
+
+ Returns: list with ElementTree elements with applied inline patterns.
+ """
+ def linkText(text):
+ if text:
+ if result:
+ if result[-1].tail:
+ result[-1].tail += text
+ else:
+ result[-1].tail = text
+ else:
+ if parent.text:
+ parent.text += text
+ else:
+ parent.text = text
+
+ result = []
+ strartIndex = 0
+ while data:
+ index = data.find(self.__placeholder_prefix, strartIndex)
+ if index != -1:
+ id, phEndIndex = self.__findPlaceholder(data, index)
+
+ if self.stashed_nodes.has_key(id):
+ node = self.stashed_nodes.get(id)
+
+ if index > 0:
+ text = data[strartIndex:index]
+ linkText(text)
+
+ if not isString(node): # it's Element
+ for child in [node] + node.getchildren():
+ if child.tail:
+ if child.tail.strip():
+ self.__processElementText(node, child, False)
+ if child.text:
+ if child.text.strip():
+ self.__processElementText(child, child)
+ else: # it's just a string
+ linkText(node)
+ strartIndex = phEndIndex
+ continue
+
+ strartIndex = phEndIndex
+ result.append(node)
+
+ else: # wrong placeholder
+ end = index + len(prefix)
+ linkText(data[strartIndex:end])
+ strartIndex = end
+ else:
+ text = data[strartIndex:]
+ linkText(text)
+ data = ""
+
+ return result
+
+ def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
+ """
+ Check if the line fits the pattern, create the necessary
+ elements, add it to stashed_nodes.
+
+ Keyword arguments:
+
+ * data: the text to be processed
+ * pattern: the pattern to be checked
+ * patternIndex: index of current pattern
+ * startIndex: string index, from which we starting search
+
+ Returns: String with placeholders instead of ElementTree elements.
+
+ """
+ match = pattern.getCompiledRegExp().match(data[startIndex:])
+ leftData = data[:startIndex]
+
+ if not match:
+ return data, False, 0
+
+ node = pattern.handleMatch(match)
+
+ if node is None:
+ return data, True, len(leftData) + match.span(len(match.groups()))[0]
+
+ if not isString(node):
+ if not isinstance(node.text, AtomicString):
+ # We need to process current node too
+ for child in [node] + node.getchildren():
+ if not isString(node):
+ if child.text:
+ child.text = self.__handleInline(child.text,
+ patternIndex + 1)
+ if child.tail:
+ child.tail = self.__handleInline(child.tail,
+ patternIndex)
+
+ placeholder = self.__stashNode(node, pattern.type())
+
+ return "%s%s%s%s" % (leftData,
+ match.group(1),
+ placeholder, match.groups()[-1]), True, 0
+
+ def applyInlinePatterns(self, markdownTree):
+ """Apply inline patterns to a parsed Markdown tree.
+
+ Iterate over ElementTree, find elements with inline tag, apply inline
+ patterns and append newly created Elements to tree. If you don't
+ want process your data with inline paterns, instead of normal string,
+ use subclass AtomicString:
+
+ node.text = AtomicString("data won't be processed with inline patterns")
+
+ Arguments:
+
+ * markdownTree: ElementTree object, representing Markdown tree.
+
+ Returns: ElementTree object with applied inline patterns.
+
+ """
+ self.stashed_nodes = {}
+
+ stack = [markdownTree.getroot()]
+
+ while stack:
+ currElement = stack.pop()
+ insertQueue = []
+ for child in currElement.getchildren():
+ if child.text and not isinstance(child.text, AtomicString):
+ text = child.text
+ child.text = None
+ lst = self.__processPlaceholders(self.__handleInline(
+ text), child)
+ stack += lst
+ insertQueue.append((child, lst))
+
+ if child.getchildren():
+ stack.append(child)
+
+ for element, lst in insertQueue:
+ if element.text:
+ element.text = handleAttributes(element.text, element)
+ i = 0
+ for newChild in lst:
+ # Processing attributes
+ if newChild.tail:
+ newChild.tail = handleAttributes(newChild.tail,
+ element)
+ if newChild.text:
+ newChild.text = handleAttributes(newChild.text,
+ newChild)
+ element.insert(i, newChild)
+ i += 1
+
+ return markdownTree
"""
@@ -615,21 +858,21 @@ Preprocessor.
class TextPreprocessor:
"""
TextPreprocessors are run before the text is broken into lines.
-
+
Each TextPreprocessor implements a "run" method that takes a pointer to a
text string of the document, modifies it as necessary and returns
- either the same pointer or a pointer to a new string.
-
+ either the same pointer or a pointer to a new string.
+
TextPreprocessors must extend markdown.TextPreprocessor.
"""
def run(self, text):
- """
- Each subclass of TextPreprocessor should override the `run` method,
- which takes the document text as a single string and returns the
+ """
+ Each subclass of TextPreprocessor should override the `run` method,
+ which takes the document text as a single string and returns the
(possibly modified) document as a single string.
-
+
"""
pass
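
As a concrete, hypothetical example of the contract described above, a TextPreprocessor that upper-cases ATX header lines before any other processing might look like this (the class is illustrative, not part of the module):

    class CapitalizeHeadersPreprocessor(TextPreprocessor):
        # Hypothetical subclass: shows the "one string in, one string out"
        # contract of run().
        def run(self, text):
            lines = []
            for line in text.split("\n"):
                if line.startswith("#"):
                    line = line.upper()
                lines.append(line)
            return "\n".join(lines)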
@@ -640,10 +883,10 @@ class Preprocessor:
Each preprocessor implements a "run" method that takes a pointer to a
list of lines of the document, modifies it as necessary and returns
- either the same pointer or a pointer to a new list.
-
+ either the same pointer or a pointer to a new list.
+
Preprocessors must extend markdown.Preprocessor.
-
+
"""
def run(self, lines):
@@ -654,17 +897,17 @@ class Preprocessor:
"""
pass
-
+
class HtmlBlockPreprocessor(TextPreprocessor):
"""Remove html blocks from the text and store them for later retrieval."""
right_tag_patterns = ["</%s>", "%s>"]
-
+
def _get_left_tag(self, block):
return block[1:].replace(">", " ", 1).split()[0].lower()
- def _get_right_tag(self, left_tag, block):
+ def _get_right_tag(self, left_tag, block):
for p in self.right_tag_patterns:
tag = p % left_tag
i = block.rfind(tag)
@@ -690,7 +933,7 @@ class HtmlBlockPreprocessor(TextPreprocessor):
def run(self, text):
new_blocks = []
- text = text.split("\n\n")
+ text = text.split("\n\n")
items = []
left_tag = ''
right_tag = ''
@@ -701,7 +944,7 @@ class HtmlBlockPreprocessor(TextPreprocessor):
if block.startswith("\n"):
block = block[1:]
text = text[1:]
-
+
if block.startswith("\n"):
block = block[1:]
@@ -709,7 +952,7 @@ class HtmlBlockPreprocessor(TextPreprocessor):
if block.startswith("<"):
left_tag = self._get_left_tag(block)
right_tag, data_index = self._get_right_tag(left_tag, block)
-
+
if data_index < len(block):
text.insert(0, block[data_index:])
block = block[:data_index]
@@ -722,13 +965,13 @@ class HtmlBlockPreprocessor(TextPreprocessor):
if self._is_oneliner(left_tag):
new_blocks.append(block.strip())
continue
-
+
if block[1] == "!":
# is a comment block
left_tag = "--"
right_tag, data_index = self._get_right_tag(left_tag, block)
# keep checking conditions below and maybe just append
-
+
if block.rstrip().endswith(">") \
and self._equal_tags(left_tag, right_tag):
new_blocks.append(
@@ -736,7 +979,7 @@ class HtmlBlockPreprocessor(TextPreprocessor):
continue
else: #if not block[1] == "!":
# if is block level tag and is not complete
-
+
if isBlockLevel(left_tag) or left_tag == "--" \
and not block.rstrip().endswith(">"):
items.append(block.strip())
@@ -744,16 +987,16 @@ class HtmlBlockPreprocessor(TextPreprocessor):
else:
new_blocks.append(
self.stash.store(block.strip()))
-
+
continue
new_blocks.append(block)
else:
items.append(block.strip())
-
+
right_tag, data_index = self._get_right_tag(left_tag, block)
-
+
if self._equal_tags(left_tag, right_tag):
# if find closing tag
in_tag = False
@@ -764,7 +1007,7 @@ class HtmlBlockPreprocessor(TextPreprocessor):
if items:
new_blocks.append(self.stash.store('\n\n'.join(items)))
new_blocks.append('\n')
-
+
return "\n\n".join(new_blocks)
HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
@@ -814,7 +1057,7 @@ class LinePreprocessor(Preprocessor):
for i in range(len(lines)):
prefix = ''
m = self.blockquote_re.search(lines[i])
- if m:
+ if m:
prefix = m.group(0)
if self._isLine(lines[i][len(prefix):]):
lines[i] = prefix + "___"
@@ -822,7 +1065,7 @@ class LinePreprocessor(Preprocessor):
def _isLine(self, block):
"""Determine if a block should be replaced with an <HR>"""
- if block.startswith(" "):
+ if block.startswith(" "):
return False # a code block
text = "".join([x for x in block if not x.isspace()])
if len(text) <= 2:
@@ -838,7 +1081,7 @@ LINE_PREPROCESSOR = LinePreprocessor()
class ReferencePreprocessor(Preprocessor):
- """Remove reference definitions from the text and store them for later use."""
+ """Remove reference definitions from the text and store them for later use."""
def run (self, lines):
new_text = [];
for line in lines:
@@ -863,8 +1106,6 @@ class ReferencePreprocessor(Preprocessor):
REFERENCE_PREPROCESSOR = ReferencePreprocessor()
-
-
"""
INLINE PATTERNS
=============================================================================
@@ -986,7 +1227,7 @@ class Pattern:
"""
pass
-
+
def type(self):
""" Return class name, to define pattern type """
return self.__class__.__name__
@@ -1002,10 +1243,10 @@ class SimpleTextPattern (Pattern):
return text
class SimpleTagPattern (Pattern):
- """
- Return element of type `tag` with a text attribute of group(3)
- of a Pattern.
-
+ """
+ Return element of type `tag` with a text attribute of group(3)
+ of a Pattern.
+
"""
def __init__ (self, pattern, tag):
Pattern.__init__(self, pattern)
@@ -1033,7 +1274,7 @@ class BacktickPattern (Pattern):
return el
-class DoubleTagPattern (SimpleTagPattern):
+class DoubleTagPattern (SimpleTagPattern):
"""Return a ElementTree element nested in tag2 nested in tag1.
Useful for strong emphasis etc.
@@ -1071,28 +1312,28 @@ class LinkPattern (Pattern):
el.set("href", self.sanitize_url(href.strip()))
else:
el.set("href", "")
-
+
if title:
title = dequote(title) #.replace('"', "&quot;")
el.set("title", title)
return el
def sanitize_url(self, url):
- """
+ """
Sanitize a url against xss attacks in "safe_mode".
Rather than specifically blacklisting `javascript:alert("XSS")` and all
its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known
- safe url formats. Most urls contain a network location, however some
- are known not to (i.e.: mailto links). Script urls do not contain a
- location. Additionally, for `javascript:...`, the scheme would be
- "javascript" but some aliases will appear to `urlparse()` to have no
- scheme. On top of that relative links (i.e.: "foo/bar.html") have no
- scheme. Therefore we must check "path", "parameters", "query" and
- "fragment" for any literal colons. We don't check "scheme" for colons
+ safe url formats. Most urls contain a network location, however some
+ are known not to (i.e.: mailto links). Script urls do not contain a
+ location. Additionally, for `javascript:...`, the scheme would be
+ "javascript" but some aliases will appear to `urlparse()` to have no
+ scheme. On top of that relative links (i.e.: "foo/bar.html") have no
+ scheme. Therefore we must check "path", "parameters", "query" and
+ "fragment" for any literal colons. We don't check "scheme" for colons
because it *should* never have any and "netloc" must allow the form:
`username:password@host:port`.
-
+
"""
locless_schemes = ['', 'mailto', 'news']
scheme, netloc, path, params, query, fragment = url = urlparse(url)
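
The whitelisting strategy the docstring describes can be reduced to a short standalone check. This is a simplification: the real `sanitize_url` also handles locationless schemes specially and returns a blank href rather than a boolean.

    from urlparse import urlparse     # urllib.parse.urlparse on Python 3

    def looks_safe(url):
        # Rough illustration: allow only known schemes, and reject any literal
        # colon hiding in the path, parameters, query or fragment.
        scheme, netloc, path, params, query, fragment = urlparse(url)
        if scheme not in ('', 'http', 'https', 'ftp', 'mailto', 'news'):
            return False
        for part in (path, params, query, fragment):
            if ":" in part:
                return False
        return True

    print looks_safe("http://example.com/page.html")        # True
    print looks_safe("javascript:alert('XSS')")             # False (scheme not whitelisted)
    print looks_safe("/redirect?to=javascript:alert(1)")    # False (colon in query)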
@@ -1123,12 +1364,12 @@ class ImagePattern(LinkPattern):
el.set('src', "")
if len(src_parts) > 1:
el.set('title', dequote(" ".join(src_parts[1:])))
-
+
if ENABLE_ATTRIBUTES:
truealt = handleAttributes(m.group(2), el)
else:
truealt = m.group(2)
-
+
el.set('alt', truealt)
return el
@@ -1152,7 +1393,7 @@ class ReferencePattern(LinkPattern):
def makeTag(self, href, title, text):
el = etree.Element('a')
-
+
el.set('href', self.sanitize_url(href))
if title:
el.set('title', title)
@@ -1181,8 +1422,8 @@ class AutolinkPattern (Pattern):
return el
class AutomailPattern (Pattern):
- """
- Return a mailto link Element given an automail link (`<foo@example.com>`).
+ """
+ Return a mailto link Element given an automail link (`<foo@example.com>`).
"""
def handleMatch(self, m):
el = etree.Element('a')
@@ -1202,7 +1443,7 @@ class AutomailPattern (Pattern):
el.text = AtomicString(''.join(letters))
mailto = "mailto:" + email
- mailto = "".join([AMP_SUBSTITUTE + '#%d;' %
+ mailto = "".join([AMP_SUBSTITUTE + '#%d;' %
ord(letter) for letter in mailto])
el.set('href', mailto)
return el
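
Stripped of the AMP_SUBSTITUTE placeholder (which protects the ampersands from later escaping), the obfuscation above simply emits every character of the mailto link as a numeric character reference; roughly:

    def obfuscate(email):
        # Illustration only: the real pattern routes the '&' through
        # AMP_SUBSTITUTE so a later postprocessor can restore it.
        return "".join(["&#%d;" % ord(ch) for ch in "mailto:" + email])

    print obfuscate("foo@example.com")
    # &#109;&#97;&#105;&#108;&#116;&#111;&#58;&#102;... one reference per character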
@@ -1246,11 +1487,11 @@ There are two types of post-processors: Postprocessor and TextPostprocessor
class Postprocessor:
"""
Postprocessors are run before the ElementTree serialization.
-
+
Each Postprocessor implements a "run" method that takes a pointer to a
- ElementTree, modifies it as necessary and returns a ElementTree
+ ElementTree, modifies it as necessary and returns a ElementTree
document.
-
+
Postprocessors must extend markdown.Postprocessor.
"""
@@ -1266,18 +1507,18 @@ class Postprocessor:
class TextPostprocessor:
"""
TextPostprocessors are run after the ElementTree it converted back into text.
-
+
Each TextPostprocessor implements a "run" method that takes a pointer to a
text string, modifies it as necessary and returns a text string.
-
+
TextPostprocessors must extend markdown.TextPostprocessor.
-
+
"""
def run(self, text):
"""
Subclasses of TextPostprocessor should implement a `run` method, which
- takes the html document as a single text string and returns a
+ takes the html document as a single text string and returns a
(possibly modified) string.
"""
@@ -1389,291 +1630,27 @@ class HtmlStash:
document.
Keyword arguments:
-
+
* html: an html segment
* safe: label an html segment as safe for safemode
-
- Returns : a placeholder string
-
+
+ Returns : a placeholder string
+
"""
self.rawHtmlBlocks.append((html, safe))
placeholder = HTML_PLACEHOLDER % self.html_counter
self.html_counter += 1
return placeholder
-
+
def reset(self):
self.html_counter = 0
self.rawHtmlBlocks = []
-class InlineProcessor:
- """
- An auxiliary class to traverse a Markdown tree, applying inline patterns.
- """
-
- def __init__ (self, patterns):
- self.inlinePatterns = patterns
-
- self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX
- self.__placeholder_suffix = ETX
- self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
- + len(self.__placeholder_suffix)
- self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})')
-
- def __makePlaceholder(self, type):
- """ Generate a placeholder """
- id = "%04d" % len(self.stashed_nodes)
- hash = INLINE_PLACEHOLDER % id
- return hash, id
-
- def __findPlaceholder(self, data, index):
- """
- Extract id from data string, start from index
-
- Keyword arguments:
-
- * data: string
- * index: index, from which we start search
-
- Returns: placeholder id and string index, after
- found placeholder
- """
- m = self.__placeholder_re.search(data, index)
- if m:
- return m.group(1), m.end()
- else:
- return None, index + 1
-
- def __stashNode(self, node, type):
- """ Add node to stash """
- placeholder, id = self.__makePlaceholder(type)
- self.stashed_nodes[id] = node
- return placeholder
-
- def __handleInline(self, data, patternIndex=0):
- """
- Process string with inline patterns and replace it
- with placeholders
-
- Keyword arguments:
-
- * data: A line of Markdown text
- * patternIndex: The index of the inlinePattern to start with
-
- Returns: String with placeholders.
-
- """
- if not isinstance(data, AtomicString):
- startIndex = 0
- while patternIndex < len(self.inlinePatterns):
- data, matched, startIndex = self.__applyPattern(
- self.inlinePatterns[patternIndex],
- data, patternIndex, startIndex)
- if not matched:
- patternIndex += 1
- return data
-
- def __processElementText(self, node, subnode, isText=True):
- """
- Process placeholders in Element.text or Element.tail
- of Elements popped from self.stashed_nodes.
-
- Keywords arguments:
-
- * node: parent node
- * subnode: processing node
- * isText: bool variable, True - it's text, False - it's tail
-
- Returns: None
-
- """
- if isText:
- text = subnode.text
- subnode.text = None
- else:
- text = subnode.tail
- subnode.tail = None
-
- childResult = self.__processPlaceholders(text, subnode)
-
- if not isText and node is not subnode:
- pos = node.getchildren().index(subnode)
- node.remove(subnode)
- else:
- pos = 0
-
- childResult.reverse()
- for newChild in childResult:
- node.insert(pos, newChild)
-
- def __processPlaceholders(self, data, parent):
- """
- Process string with placeholders and generate ElementTree tree.
-
- Keyword arguments:
-
- * data: string with placeholders instead of ElementTree elements.
- * parent: Element, which contains processing inline data
-
- Returns: list with ElementTree elements with applied inline patterns.
- """
- def linkText(text):
- if text:
- if result:
- if result[-1].tail:
- result[-1].tail += text
- else:
- result[-1].tail = text
- else:
- if parent.text:
- parent.text += text
- else:
- parent.text = text
-
- result = []
- strartIndex = 0
- while data:
- index = data.find(self.__placeholder_prefix, strartIndex)
- if index != -1:
- id, phEndIndex = self.__findPlaceholder(data, index)
-
- if self.stashed_nodes.has_key(id):
- node = self.stashed_nodes.get(id)
-
- if index > 0:
- text = data[strartIndex:index]
- linkText(text)
-
- if not isString(node): # it's Element
- for child in [node] + node.getchildren():
- if child.tail:
- if child.tail.strip():
- self.__processElementText(node, child, False)
- if child.text:
- if child.text.strip():
- self.__processElementText(child, child)
- else: # it's just a string
- linkText(node)
- strartIndex = phEndIndex
- continue
-
- strartIndex = phEndIndex
- result.append(node)
-
- else: # wrong placeholder
- end = index + len(prefix)
- linkText(data[strartIndex:end])
- strartIndex = end
- else:
- text = data[strartIndex:]
- linkText(text)
- data = ""
-
- return result
-
-
- def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
- """
- Check if the line fits the pattern, create the necessary
- elements, add it to stashed_nodes.
-
- Keyword arguments:
-
- * data: the text to be processed
- * pattern: the pattern to be checked
- * patternIndex: index of current pattern
- * startIndex: string index, from which we starting search
-
- Returns: String with placeholders instead of ElementTree elements.
- """
- match = pattern.getCompiledRegExp().match(data[startIndex:])
- leftData = data[:startIndex]
-
- if not match:
- return data, False, 0
-
- node = pattern.handleMatch(match)
-
- if node is None:
- return data, True, len(leftData) + match.span(len(match.groups()))[0]
-
- if not isString(node):
- if not isinstance(node.text, AtomicString):
- # We need to process current node too
- for child in [node] + node.getchildren():
- if not isString(node):
- if child.text:
- child.text = self.__handleInline(child.text,
- patternIndex + 1)
- if child.tail:
- child.tail = self.__handleInline(child.tail,
- patternIndex)
-
- placeholder = self.__stashNode(node, pattern.type())
-
- return "%s%s%s%s" % (leftData,
- match.group(1),
- placeholder, match.groups()[-1]), True, 0
-
-
- def applyInlinePatterns(self, markdownTree):
- """
- Iterate over ElementTree, find elements with inline tag, apply inline
- patterns and append newly created Elements to tree. If you don't
- want process your data with inline paterns, instead of normal string,
- use subclass AtomicString:
-
- node.text = AtomicString("data won't be processed with inline patterns")
-
- Arguments:
-
- * markdownTree: ElementTree object, representing Markdown tree.
-
- Returns: ElementTree object with applied inline patterns.
- """
- self.stashed_nodes = {}
-
- stack = [markdownTree.getroot()]
-
- while stack:
- currElement = stack.pop()
- insertQueue = []
- for child in currElement.getchildren():
- if child.text and not isinstance(child.text, AtomicString):
- text = child.text
- child.text = None
- lst = self.__processPlaceholders(self.__handleInline(
- text), child)
- stack += lst
- insertQueue.append((child, lst))
-
- if child.getchildren():
- stack.append(child)
-
- for element, lst in insertQueue:
- if element.text:
- element.text = handleAttributes(element.text, element)
- i = 0
- for newChild in lst:
- # Processing attributes
- if newChild.tail:
- newChild.tail = handleAttributes(newChild.tail,
- element)
- if newChild.text:
- newChild.text = handleAttributes(newChild.text,
- newChild)
- element.insert(i, newChild)
- i += 1
-
- return markdownTree
-
-
-
-
class Markdown:
"""Convert Markdown to HTML."""
- def __init__(self,
+ def __init__(self,
extensions=[],
extension_configs={},
safe_mode = False):
@@ -1681,14 +1658,14 @@ class Markdown:
Creates a new Markdown instance.
Keyword arguments:
-
- * extensions: A list of extensions.
- If they are of type string, the module mdx_name.py will be loaded.
- If they are a subclass of markdown.Extension, they will be used
+
+ * extensions: A list of extensions.
+ If they are of type string, the module mdx_name.py will be loaded.
+ If they are a subclass of markdown.Extension, they will be used
as-is.
* extension-configs: Configuration setting for extensions.
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
-
+
"""
self.parser = MarkdownParser()
self.safeMode = safe_mode
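
Typical construction per the docstring above. The extension name, its config value, and the `convert()` call are illustrative assumptions about this version's API, not taken from the hunk itself:

    text = u"Some *Markdown* source.\n"
    md = Markdown(extensions=["footnotes"],
                  extension_configs={"footnotes": [("PLACE_MARKER", "~~~~")]},
                  safe_mode="escape")
    html = md.convert(text)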
@@ -1716,7 +1693,7 @@ class Markdown:
AMPSUBSTITUTETEXTPOSTPROCESSOR]
self.prePatterns = []
-
+
self.inlinePatterns = [
BACKTICK_PATTERN,
ESCAPE_PATTERN,
@@ -1737,28 +1714,25 @@ class Markdown:
EMPHASIS_PATTERN_2
# The order of the handlers matters!!!
]
-
+
self.inlineProcessor = InlineProcessor(self.inlinePatterns)
self.references = {}
self.htmlStash = HtmlStash()
-
-
self.registerExtensions(extensions = extensions,
configs = extension_configs)
-
self.reset()
def registerExtensions(self, extensions, configs):
- """
+ """
Register extensions with this instance of Markdown.
Keyword aurguments:
-
+
* extensions: A list of extensions, which can either
be strings or objects. See the docstring on Markdown.
- * configs: A dictionary mapping module names to config options.
-
+ * configs: A dictionary mapping module names to config options.
+
"""
for ext in extensions:
if isinstance(ext, basestring):
@@ -1865,12 +1839,12 @@ class Markdown:
* input: Name of source text file.
* output: Name of output file. Writes to stdout if `None`.
- * extensions: A list of extension names (may contain config args).
+ * extensions: A list of extension names (may contain config args).
* encoding: Encoding of input and output files. Defaults to utf-8.
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
"""
-
+
encoding = encoding or "utf-8"
# Read the source
@@ -1899,8 +1873,8 @@ Extensions
class Extension:
""" Base class for extensions to subclass. """
def __init__(self, configs = {}):
- """Create an instance of an Extention.
-
+ """Create an instance of an Extention.
+
Keyword arguments:
* configs: A dict of configuration setting used by an Extension.
@@ -1923,9 +1897,9 @@ class Extension:
self.config[key][0] = value
def extendMarkdown(self, md, md_globals):
- """
- Add the various proccesors and patterns to the Markdown Instance.
-
+ """
+ Add the various proccesors and patterns to the Markdown Instance.
+
This method must be overriden by every extension.
Keyword arguments:
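
A skeleton of the override described above, as a hypothetical extension; the pattern class, regular expression, and insertion point are assumptions for illustration, not part of the module:

    class DelExtension(Extension):
        # Hypothetical extension: adds an inline pattern turning ~~text~~
        # into a <del> element.
        def extendMarkdown(self, md, md_globals):
            DEL_RE = r'(~~)(.+?)(~~)'
            md.inlinePatterns.append(SimpleTagPattern(DEL_RE, 'del'))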
@@ -1940,10 +1914,10 @@ class Extension:
def load_extension(ext_name, configs = []):
"""Load extension by name, then return the module.
-
- The extension name may contain arguments as part of the string in the
+
+ The extension name may contain arguments as part of the string in the
following format: "extname(key1=value1,key2=value2)"
-
+
"""
# Parse extensions config params (ignore the order)
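
So a call of the following shape (the extension name and config key are purely illustrative) resolves to the module mdx_footnotes plus a list of config pairs:

    ext = load_extension("footnotes(PLACE_MARKER=~~~~)")
    # roughly equivalent to:
    # load_extension("footnotes", configs=[("PLACE_MARKER", "~~~~")])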
@@ -1991,7 +1965,7 @@ def load_extensions(ext_names):
# Extensions should use "markdown.etree" instead of "etree" (or do `from
# markdown import etree`). Do not import it by yourself.
-etree = importETree()
+etree = importETree()
"""
EXPORTED FUNCTIONS
@@ -2008,12 +1982,12 @@ def markdown(text,
This is a shortcut function for `Markdown` class to cover the most
basic use case. It initializes an instance of Markdown, loads the
- necessary extensions and runs the parser on the given text.
+ necessary extensions and runs the parser on the given text.
Keyword arguments:
* text: Markdown formatted text as Unicode or ASCII string.
- * extensions: A list of extensions or extension names (may contain config args).
+ * extensions: A list of extensions or extension names (may contain config args).
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
Returns: An HTML document as a string.
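
Basic use of the shortcut; the output comment shows the expected shape rather than captured output, and the extension name is illustrative:

    source = u"Hello, *world*!"
    html = markdown(source)
    # -> u'<p>Hello, <em>world</em>!</p>'
    html = markdown(source, extensions=["footnotes"], safe_mode="escape")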
@@ -2048,7 +2022,7 @@ Python 2.3 or higher required for advanced command line options.
For lower versions of Python use:
%s INPUT_FILE > OUTPUT_FILE
-
+
""" % EXECUTABLE_NAME_FOR_USAGE
def parse_options():
@@ -2071,7 +2045,7 @@ def parse_options():
parser = optparse.OptionParser(usage="%prog INPUTFILE [options]")
parser.add_option("-f", "--file", dest="filename",
- help="write output to OUTPUT_FILE",
+ help="write output to OUTPUT_FILE",
metavar="OUTPUT_FILE")
parser.add_option("-e", "--encoding", dest="encoding",
help="encoding for input and output files",)