path: root/markdown.py
diff options
Diffstat (limited to 'markdown.py')
1 files changed, 578 insertions, 630 deletions
diff --git a/markdown.py b/markdown.py
index 86f83ab..059ac87 100644..100755
--- a/markdown.py
+++ b/markdown.py
@@ -33,13 +33,15 @@ __revision__ = "$Rev$"
-import re, sys, codecs
+import re, sys, codecs, htmlentitydefs
from urlparse import urlparse, urlunparse
from logging import getLogger, StreamHandler, Formatter, \
@@ -56,13 +58,59 @@ logger.addHandler(console_hndlr)
def message(level, text):
''' A wrapper method for logging debug messages. '''
logger.log(level, text)
+def isstr(s):
+ """ Return True when `s` is a byte string (`str`) or a `unicode` string. """
+ return isinstance(s, (unicode, str))
+def importETree():
+ """ Import the first available ElementTree implementation and
+ return the module object.
+ Candidates are tried in this order: xml.etree.cElementTree,
+ xml.etree.ElementTree, cElementTree, elementtree.ElementTree.
+ If none of them can be imported, a CRITICAL message is logged and
+ the process is terminated with sys.exit(1). """
+ try:
+ # Python 2.5+: cElementTree shipped in the standard library
+ import xml.etree.cElementTree as etree
+ except ImportError:
+ try:
+ # Python 2.5+: ElementTree shipped in the standard library
+ import xml.etree.ElementTree as etree
+ except ImportError:
+ try:
+ # normal cElementTree install (stand-alone package)
+ import cElementTree as etree
+ except ImportError:
+ try:
+ # normal ElementTree install (stand-alone package)
+ import elementtree.ElementTree as etree
+ except ImportError:
+ # Nothing usable was found: report it and stop the program
+ message(CRITICAL,
+ "Failed to import ElementTree from any known place")
+ sys.exit(1)
+ return etree
+etree = importETree()
+def indentETree(elem, level=0):
+ """ Fill in empty or whitespace-only `text` and `tail` values of
+ `elem` and, recursively, of its children with a newline followed
+ by level-dependent padding. Text and tails that contain
+ non-whitespace content are left unchanged. The tree is modified
+ in place and nothing is returned.
+ * elem: element whose subtree is updated.
+ * level: nesting depth of `elem`; callers start with the default 0. """
+ # `i` is the newline-plus-padding string used at this depth
+ if level > 1:
+ i = "\n" + (level-1)*" "
+ else:
+ i = "\n"
+ # Only elements that have children get their text and child tails set
+ if len(elem):
+ if not elem.text or not elem.text.strip():
+ elem.text = i + " "
+ for e in elem:
+ indentETree(e, level+1)
+ if not e.tail or not e.tail.strip():
+ e.tail = i
+ # The tail is only touched when `level` is non-zero (not the root call)
+ if level and (not elem.tail or not elem.tail.strip()):
+ elem.tail = i
# --------------- CONSTANTS YOU MIGHT WANT TO MODIFY -----------------
TAB_LENGTH = 4 # expand tabs to this many spaces
ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
-SMART_EMPHASIS = 1 # this_or_that does not become this<i>or</i>that
+SMART_EMPHASIS = True # this_or_that does not become this<i>or</i>that
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
@@ -107,6 +155,11 @@ EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
+AND_SUBSTITUTE = unichr(2) + unichr(4) + unichr(3)
# a template for html placeholders
START = u'\u0001'
END = u'\u0002'
@@ -126,330 +179,23 @@ def isBlockLevel (tag):
return ( (tag in BLOCK_LEVEL_ELEMENTS) or
(tag[0] == 'h' and tag[1] in "0123456789") )
-========================== NANODOM ===================================
-The three classes below implement some of the most basic DOM
-methods. I use this instead of minidom because I need a simpler
-functionality and do not want to require additional libraries.
-Importantly, NanoDom does not do normalization, which is what we
-want. It also adds extra white space when converting DOM to string
-ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&amp;"),
- (re.compile("<"), "&lt;"),
- (re.compile(">"), "&gt;")]
-ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&amp;"),
- (re.compile("<"), "&lt;"),
- (re.compile(">"), "&gt;"),
- (re.compile("\""), "&quot;")]
-def getBidiType(text):
- """
- Get Bi-directional text type. Used by TextNode to determine text direction.
- """
- if not text: return None
- ch = text[0]
- if not isinstance(ch, unicode) or not ch.isalpha():
- return None
+def codepoint2name(code):
+ """ Returns entity definition by code, or code
+ if there is no such entity definition"""
+ entity = htmlentitydefs.codepoint2name.get(code)
+ if entity:
+ return "%s%s;" % (AND_SUBSTITUTE, entity)
- for min, max in RTL_BIDI_RANGES:
- if ( ch >= min and ch <= max ):
- return "rtl"
- else:
- return "ltr"
-class Document:
- """
- Document root of the NanoDom. An instance stores DOM elements as children.
- """
- def __init__ (self):
- """ Create a NanoDom document. """
- self.bidi = "ltr"
- def appendChild(self, child):
- """ Add a dom element as a child of the document root. """
- self.documentElement = child
- child.isDocumentElement = True
- child.parent = self
- self.entities = {}
- def setBidi(self, bidi):
- """ Set text direction (right-left or left-right)."""
- if bidi:
- self.bidi = bidi
- def createElement(self, tag, textNode=None):
- """ Given a tag or textNode, return a dom element. """
- el = Element(tag)
- el.doc = self
- if textNode:
- el.appendChild(self.createTextNode(textNode))
- return el
- def createTextNode(self, text):
- """ Return given text as a TextNode. """
- node = TextNode(text)
- node.doc = self
- return node
- def createEntityReference(self, entity):
- """ Return an html entitry reference (i.e.: `&amp;`). """
- if entity not in self.entities:
- self.entities[entity] = EntityReference(entity)
- return self.entities[entity]
- def createCDATA(self, text):
- """ Return the given text as a CDATA node. """
- node = CDATA(text)
- node.doc = self
- return node
- def toxml (self):
- """ Convert document to xml and return a string. """
- return self.documentElement.toxml()
- def normalizeEntities(self, text, avoidDoubleNormalizing=False):
- """ Return the given text as an html entity (i.e.: `<` => `&gt;`). """
- if avoidDoubleNormalizing:
- else:
- for regexp, substitution in regexps:
- text = regexp.sub(substitution, text)
- return text
- def find(self, test):
- """ Return a list of descendants that pass the test function """
- return self.documentElement.find(test)
- def unlink(self):
- """ Cleanup: Remove all children from the document. """
- self.documentElement.unlink()
- self.documentElement = None
-class CDATA:
- """ CDATA node type of NanoDom. """
- type = "cdata"
- def __init__ (self, text):
- """ Create a CDATA node with given text. """
- self.text = text
- def handleAttributes(self):
- """ Not implemented for CDATA node type. """
- pass
- def toxml (self):
- """ Return CDATA node as a string. """
- return "<![CDATA[" + self.text + "]]>"
-class Element:
- """
- Element node type of Nanodom.
+ return "%s#%d;" % (AND_SUBSTITUTE, code)
- All html tags would most likely be represented as Elements.
- """
- type = "element"
- def __init__ (self, tag):
- """ Create an Element node instance. """
- self.nodeName = tag
- self.attributes = []
- self.attribute_values = {}
- self.childNodes = []
- self.bidi = None
- self.isDocumentElement = False
- def setBidi(self, bidi):
- """ Set text direction (i.e.: right-left or left-right). """
- if bidi:
- orig_bidi = self.bidi
- if not self.bidi or self.isDocumentElement:
- # Once the bidi is set don't change it (except for doc element)
- self.bidi = bidi
- self.parent.setBidi(bidi)
- def unlink(self):
- """ Cleanup: Remove all children of the Element. """
- for child in self.childNodes:
- if child.type == "element":
- child.unlink()
- self.childNodes = None
- def setAttribute(self, attr, value):
- """
- Assign an html/xml attribute to the Element (i.e.: id, class, href).
- """
- if not attr in self.attributes:
- self.attributes.append(attr)
- self.attribute_values[attr] = value
- def insertChild(self, position, child):
- """ Insert a child Element at the given position. """
- self.childNodes.insert(position, child)
- child.parent = self
- def removeChild(self, child):
- """ Remove the given child from the Element. """
- self.childNodes.remove(child)
- def replaceChild(self, oldChild, newChild):
- """ Replace an old child Element with a new child Element. """
- position = self.childNodes.index(oldChild)
- self.removeChild(oldChild)
- self.insertChild(position, newChild)
- def appendChild(self, child):
- """ Append a new child Element to the end of the child Elements. """
- self.childNodes.append(child)
- child.parent = self
- def handleAttributes(self):
- """ Not implemented for Element node type. """
- pass
- def find(self, test, depth=0):
- """ Returns a list of descendants that pass the test function """
- matched_nodes = []
- for child in self.childNodes:
- if test(child):
- matched_nodes.append(child)
- if child.type == "element":
- matched_nodes += child.find(test, depth+1)
- return matched_nodes
- def toxml(self):
- """ Return the Element and all children as a string. """
- for child in self.childNodes:
- child.handleAttributes()
- buffer = ""
- if self.nodeName in ['h1', 'h2', 'h3', 'h4']:
- buffer += "\n"
- elif self.nodeName in ['li']:
- buffer += "\n "
- # Process children FIRST, then do the attributes
- childBuffer = ""
- if self.childNodes or self.nodeName in ['blockquote']:
- childBuffer += ">"
- for child in self.childNodes:
- childBuffer += child.toxml()
- if self.nodeName == 'p':
- childBuffer += "\n"
- elif self.nodeName == 'li':
- childBuffer += "\n "
- childBuffer += "</%s>" % self.nodeName
- else:
- childBuffer += "/>"
- buffer += "<" + self.nodeName
- if self.nodeName in ['p', 'li', 'ul', 'ol',
- 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
- if not self.attribute_values.has_key("dir"):
- if self.bidi:
- bidi = self.bidi
- else:
- bidi = self.doc.bidi
- if bidi=="rtl":
- self.setAttribute("dir", "rtl")
- for attr in self.attributes:
- value = self.attribute_values[attr]
- value = self.doc.normalizeEntities(value,
- avoidDoubleNormalizing=True)
- buffer += ' %s="%s"' % (attr, value)
- # Now let's actually append the children
- buffer += childBuffer
- if self.nodeName in ['p', 'br ', 'li', 'ul', 'ol',
- 'h1', 'h2', 'h3', 'h4'] :
- buffer += "\n"
- return buffer
-class TextNode:
- """ A Text node type of the NanoDom. """
- type = "text"
- attrRegExp = re.compile(r'\{@([^\}]*)=([^\}]*)}') # {@id=123}
- def __init__ (self, text):
- """ Create a TextNode with the given text. """
- self.value = text
- def attributeCallback(self, match):
- """ Regex callback method to set attribute on parent. """
- self.parent.setAttribute(match.group(1), match.group(2))
- def handleAttributes(self):
- """ Parse and assign attributes to the parent Element. """
- self.value = self.attrRegExp.sub(self.attributeCallback, self.value)
- def toxml(self):
- """ Return the TextNode as a string. """
- text = self.value
- self.parent.setBidi(getBidiType(text))
+def handleAttributes(text, parent):
- if not text.startswith(HTML_PLACEHOLDER_PREFIX):
- if self.parent.nodeName == "p":
- text = text.replace("\n", "\n ")
- elif (self.parent.nodeName == "li"
- and self.parent.childNodes[0]==self):
- text = "\n " + text.replace("\n", "\n ")
- text = self.doc.normalizeEntities(text)
- return text
-class EntityReference:
- """ EntityReference node type of NanoDom. """
- type = "entity_ref"
- def __init__(self, entity):
- """ Create an EntityReference of the given entity. """
- self.entity = entity
- def handleAttributes(self):
- """ Not implemented for EntityReference. """
- pass
- def toxml(self):
- """ Return the EntityReference as a string. """
- return "&" + self.entity + ";"
+ def attributeCallback(match):
+ parent.set(match.group(1), match.group(2))
+ return RE.regExp['attr'].sub(attributeCallback, text)
@@ -613,7 +359,7 @@ class HeaderPreprocessor(Preprocessor):
Replace underlined headers with hashed headers to avoid
- the nead for lookahead later.
+ the need for lookahead later.
def run (self, lines):
@@ -656,29 +402,32 @@ class LinePreprocessor(Preprocessor):
blockquote_re = re.compile(r'^(> )+')
def run (self, lines):
- """ Find a store HR lines. """
+ """ Find and replace HR lines. """
for i in range(len(lines)):
prefix = ''
m = self.blockquote_re.search(lines[i])
- if m : prefix = m.group(0)
+ if m:
+ prefix = m.group(0)
if self._isLine(lines[i][len(prefix):]):
- lines[i] = prefix + self.stash.store("<hr />", safe=True)
+ #lines[i] = prefix + self.stash.store("<hr />", safe=True)
+ lines[i] = prefix + "___"
return lines
def _isLine(self, block):
"""Determine if a block should be replaced with an <HR>"""
- if block.startswith(" "): return 0 # a code block
+ if block.startswith(" "):
+ return False # a code block
text = "".join([x for x in block if not x.isspace()])
if len(text) <= 2:
- return 0
+ return False
for pattern in ['isline1', 'isline2', 'isline3']:
m = RE.regExp[pattern].match(text)
if (m and m.group(1)):
- return 1
+ return True
- return 0
+ return False
LINE_PREPROCESSOR = LinePreprocessor()
@@ -713,6 +462,7 @@ class ReferencePreprocessor(Preprocessor):
REFERENCE_PREPROCESSOR = ReferencePreprocessor()
========================== INLINE PATTERNS ===========================
@@ -725,9 +475,8 @@ expression and needs support the following methods:
pattern.getCompiledRegExp() - returns a regular expression
- pattern.handleMatch(m, doc) - takes a match object and returns
- a NanoDom node (as a part of the provided
- doc) or None
+ pattern.handleMatch(m) - takes a match object and returns
+ an ElementTree element or just plain text
All of python markdown's built-in patterns subclass from Pattern,
but you can add additional patterns that don't.
@@ -765,32 +514,31 @@ BRK = ( r'\[('
+ NOBRACKET + r')\]' )
NOIMG = r'(?<!\!)'
-BACKTICK_RE = r'\`([^\`]*)\`' # `e= m*c^2`
-DOUBLE_BACKTICK_RE = r'\`\`(.*?)\`\`' # ``e=f("`")``
+BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")``
ESCAPE_RE = r'\\(.)' # \<
-EMPHASIS_RE = r'\*([^\*]*)\*' # *emphasis*
-STRONG_RE = r'\*\*(.*?)\*\*' # **strong**
-STRONG_EM_RE = r'\*\*\*(.*?)\*\*\*' # ***strong***
+EMPHASIS_RE = r'(\*)([^\*]*)\2' # *emphasis*
+STRONG_RE = r'(\*{2}|_{2})(.*?)\2' # **strong**
+STRONG_EM_RE = r'(\*{3}|_{3})(.*?)\2' # ***strong***
- EMPHASIS_2_RE = r'(?<!\S)_(\S[^_]*)_' # _emphasis_
+ EMPHASIS_2_RE = r'(?<!\S)(_)(\S.*?)\2' # _emphasis_
- EMPHASIS_2_RE = r'_([^_]*)_' # _emphasis_
+ EMPHASIS_2_RE = r'(_)(.*?)\2' # _emphasis_
-STRONG_2_RE = r'__(.*?)__' # __strong__
-STRONG_EM_2_RE = r'___(.*?)___' # ___strong___
+#LINK_RE = NOIMG + BRK + r'\s*\(([^\)]*)\)' # [text](url)
-LINK_RE = NOIMG + BRK + r'\s*\(([^\)]*)\)' # [text](url)
-LINK_ANGLED_RE = NOIMG + BRK + r'\s*\(<([^\)]*)>\)' # [text](<url>)
-IMAGE_LINK_RE = r'\!' + BRK + r'\s*\(([^\)]*)\)' # ![alttxt](http://x.com/)
+r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*)\12)?\)''' # [text](url) or [text](<url>)
+IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)' # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3]
IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2]
NOT_STRONG_RE = r'( \* )' # stand-alone * or _
AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>' # <http://www.123.com>
AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # <me@example.com>
#HTML_RE = r'(\<[^\>]*\>)' # <...>
-HTML_RE = r'(\<[a-zA-Z/][^\>]*\>)' # <...>
-ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &amp;
+HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...>
+ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &amp;
LINE_BREAK_RE = r' \n' # two spaces at end of line
LINE_BREAK_2_RE = r' $' # two spaces at end of text
@@ -816,30 +564,35 @@ class Pattern:
""" Return a compiled regular expression. """
return self.compiled_re
- def handleMatch(self, m, doc):
+ def handleMatch(self, m):
- Return a NanoDom element from the given match. Subclasses should
+ Return an ElementTree element from the given match. Subclasses should
override this method.
Keyword arguments:
* m: A re match object containing a match of the pattern.
- * doc: An instance of a NanoDom Document.
+ def type(self):
+ """ Return class name, to define pattern type """
+ return self.__class__.__name__
BasePattern = Pattern # for backward compatibility
class SimpleTextPattern (Pattern):
- """ Return a simple TextNode of group(2) of a Pattern. """
- def handleMatch(self, m, doc):
- return doc.createTextNode(m.group(2))
+ """ Return a simple text of group(2) of a Pattern. """
+ def handleMatch(self, m):
+ text = m.group(2)
+ # NOTE(review): as shown here this `return None` is unconditional, which
+ # makes the following `return text` unreachable. A guarding `if` line
+ # appears to be missing from this view - confirm against the full file.
+ return None
+ return text
class SimpleTagPattern (Pattern):
- Return NanoDom Element of type `tag` with a child TextNode of group(2)
+ Return element of type `tag` with a text attribute of group(3)
of a Pattern.
@@ -847,71 +600,69 @@ class SimpleTagPattern (Pattern):
Pattern.__init__(self, pattern)
self.tag = tag
- def handleMatch(self, m, doc):
- el = doc.createElement(self.tag)
- el.appendChild(doc.createTextNode(m.group(2)))
+ def handleMatch(self, m):
+ el = etree.Element(self.tag)
+ el.text = m.group(3)
return el
class SubstituteTagPattern (SimpleTagPattern):
- """ Return a NanoDom ELement of type `tag` with no children. """
- def handleMatch (self, m, doc):
- return doc.createElement(self.tag)
+ """ Return an element of type `tag` with no children. """
+ def handleMatch (self, m):
+ return etree.Element(self.tag)
class BacktickPattern (Pattern):
- """ Return a NanoDom `<code>` Element containing the matching text. """
+ """ Return a `<code>` element containing the matching text. """
def __init__ (self, pattern):
Pattern.__init__(self, pattern)
self.tag = "code"
- def handleMatch(self, m, doc):
- el = doc.createElement(self.tag)
- text = m.group(2).strip()
- #text = text.replace("&", "&amp;")
- el.appendChild(doc.createTextNode(text))
+ def handleMatch(self, m):
+ el = etree.Element(self.tag)
+ el.text = m.group(3).strip()
return el
class DoubleTagPattern (SimpleTagPattern):
- Return a TextNode nested in tag2 nested in tag1.
+ Return an ElementTree element nested in tag2 nested in tag1.
Usefull for strong emphasis etc.
- def handleMatch(self, m, doc):
+ def handleMatch(self, m):
tag1, tag2 = self.tag.split(",")
- el1 = doc.createElement(tag1)
- el2 = doc.createElement(tag2)
- el1.appendChild(el2)
- el2.appendChild(doc.createTextNode(m.group(2)))
+ el1 = etree.Element(tag1)
+ el2 = etree.SubElement(el1, tag2)
+ el2.text = m.group(3)
return el1
class HtmlPattern (Pattern):
""" Store raw inline html and return a placeholder. """
- def handleMatch (self, m, doc):
+ def handleMatch (self, m):
rawhtml = m.group(2)
inline = True
place_holder = self.stash.store(rawhtml)
- return doc.createTextNode(place_holder)
+ return place_holder
class LinkPattern (Pattern):
- """ Return a NanoDom link Element from the given match. """
- def handleMatch(self, m, doc):
- el = doc.createElement('a')
- el.appendChild(doc.createTextNode(m.group(2)))
- parts = m.group(9).split('"')
- # We should now have [], [href], or [href, title]
- if parts:
- el.setAttribute('href', self.sanatize_url(parts[0].strip()))
+ """ Return a link element from the given match. """
+ def handleMatch(self, m):
+ el = etree.Element("a")
+ el.text = m.group(2)
+ title = m.group(11)
+ href = m.group(9)
+ if href:
+ if href[0] == "<":
+ href = href[1:-1]
+ el.set("href", self.sanatize_url(href.strip()))
- el.setAttribute('href', "")
- if len(parts) > 1:
- # we also got a title
- title = '"' + '"'.join(parts[1:]).strip()
+ el.set("href", "")
+ if title:
title = dequote(title) #.replace('"', "&quot;")
- el.setAttribute('title', title)
+ el.set("title", title)
return el
def sanatize_url(self, url):
@@ -947,30 +698,32 @@ class LinkPattern (Pattern):
return urlunparse(url)
class ImagePattern(LinkPattern):
- """ Return a NanoDom img Element from the given match. """
- def handleMatch(self, m, doc):
- el = doc.createElement('img')
+ """ Return an img element from the given match. """
+ def handleMatch(self, m):
+ el = etree.Element("img")
src_parts = m.group(9).split()
if src_parts:
- el.setAttribute('src', self.sanatize_url(src_parts[0]))
+ src = src_parts[0]
+ if src[0] == "<" and src[-1] == ">":
+ src = src[1:-1]
+ el.set('src', self.sanatize_url(src))
- el.setAttribute('src', "")
+ el.set('src', "")
if len(src_parts) > 1:
- el.setAttribute('title', dequote(" ".join(src_parts[1:])))
+ el.set('title', dequote(" ".join(src_parts[1:])))
- text = doc.createTextNode(m.group(2))
- el.appendChild(text)
- text.handleAttributes()
- truealt = text.value
- el.childNodes.remove(text)
+ truealt = handleAttributes(m.group(2), el)
truealt = m.group(2)
- el.setAttribute('alt', truealt)
+ el.set('alt', truealt)
return el
class ReferencePattern(LinkPattern):
- """ Match to a stored reference and return a NanoDom link Element. """
- def handleMatch(self, m, doc):
+ """ Match to a stored reference and return link element. """
+ def handleMatch(self, m):
if m.group(9):
id = m.group(9).lower()
@@ -982,35 +735,38 @@ class ReferencePattern(LinkPattern):
if not self.references.has_key(id): # ignore undefined refs
return None
href, title = self.references[id]
text = m.group(2)
- return self.makeTag(href, title, text, doc)
+ return self.makeTag(href, title, text)
- def makeTag(self, href, title, text, doc):
- el = doc.createElement('a')
- el.setAttribute('href', self.sanatize_url(href))
+ def makeTag(self, href, title, text):
+ el = etree.Element('a')
+ el.set('href', self.sanatize_url(href))
if title:
- el.setAttribute('title', title)
- el.appendChild(doc.createTextNode(text))
+ el.set('title', title)
+ el.text = text
return el
class ImageReferencePattern (ReferencePattern):
- """ Match to a stored reference and return a NanoDom img Element. """
- def makeTag(self, href, title, text, doc):
- el = doc.createElement('img')
- el.setAttribute('src', self.sanatize_url(href))
+ """ Match to a stored reference and return img element. """
+ def makeTag(self, href, title, text):
+ el = etree.Element("img")
+ el.set("src", self.sanatize_url(href))
if title:
- el.setAttribute('title', title)
- el.setAttribute('alt', text)
+ el.set("title", title)
+ el.set("alt", text)
return el
class AutolinkPattern (Pattern):
""" Return a link Element given an autolink (`<http://example/com>`). """
- def handleMatch(self, m, doc):
- el = doc.createElement('a')
- el.setAttribute('href', m.group(2))
- el.appendChild(doc.createTextNode(m.group(2)))
+ def handleMatch(self, m):
+ el = etree.Element("a")
+ el.set('href', m.group(2))
+ el.text = m.group(2)
return el
class AutomailPattern (Pattern):
@@ -1018,37 +774,35 @@ class AutomailPattern (Pattern):
Return a mailto link Element given an automail link (`<foo@example.com>`).
- def handleMatch(self, m, doc):
- el = doc.createElement('a')
+ def handleMatch(self, m):
+ el = etree.Element('a')
email = m.group(2)
if email.startswith("mailto:"):
email = email[len("mailto:"):]
+ el.text = ""
for letter in email:
- entity = doc.createEntityReference("#%d" % ord(letter))
- el.appendChild(entity)
+ el.text += codepoint2name(ord(letter))
mailto = "mailto:" + email
- mailto = "".join(['&#%d;' % ord(letter) for letter in mailto])
- el.setAttribute('href', mailto)
+ mailto = "".join([AND_SUBSTITUTE + '#%d;' %
+ ord(letter) for letter in mailto])
+ el.set('href', mailto)
return el
STRONG_PATTERN = SimpleTagPattern(STRONG_RE, 'strong')
-STRONG_PATTERN_2 = SimpleTagPattern(STRONG_2_RE, 'strong')
EMPHASIS_PATTERN_2 = SimpleTagPattern(EMPHASIS_2_RE, 'em')
STRONG_EM_PATTERN = DoubleTagPattern(STRONG_EM_RE, 'strong,em')
-STRONG_EM_PATTERN_2 = DoubleTagPattern(STRONG_EM_2_RE, 'strong,em')
LINE_BREAK_PATTERN = SubstituteTagPattern(LINE_BREAK_RE, 'br ')
LINE_BREAK_PATTERN_2 = SubstituteTagPattern(LINE_BREAK_2_RE, 'br ')
@@ -1075,10 +829,10 @@ There are two types of post-processors: Postprocessor and TextPostprocessor
class Postprocessor:
- Postprocessors are run before the dom it converted back into text.
+ Postprocessors are run before the ElementTree serialization.
Each Postprocessor implements a "run" method that takes a pointer to a
- NanoDom document, modifies it as necessary and returns a NanoDom
+ ElementTree, modifies it as necessary and returns an ElementTree
Postprocessors must extend markdown.Postprocessor.
@@ -1088,11 +842,10 @@ class Postprocessor:
- def run(self, dom):
+ def run(self, et):
Subclasses of Postprocessor should implement a `run` method, which
- takes a NanoDOm document and returns a (possably modified) NanoDom
- document.
+ takes an ElementTree and returns a (possibly modified) ElementTree.
@@ -1101,7 +854,7 @@ class Postprocessor:
class TextPostprocessor:
- TextPostprocessors are run after the dom it converted back into text.
+ TextPostprocessors are run after the ElementTree is converted back into text.
Each TextPostprocessor implements a "run" method that takes a pointer to a
text string, modifies it as necessary and returns a text string.
@@ -1152,6 +905,20 @@ class RawHtmlTextPostprocessor(TextPostprocessor):
+class AndSubstitutePostprocessor(TextPostprocessor):
+ """ Turn every AND_SUBSTITUTE marker in the text back into "&". """
+ def __init__(self):
+ # Nothing to set up: this postprocessor stores no attributes.
+ pass
+ def run(self, text):
+ """ Return `text` with each AND_SUBSTITUTE occurrence replaced by "&". """
+ return text.replace(AND_SUBSTITUTE, "&")
========================== MISC AUXILIARY CLASSES ====================
@@ -1280,7 +1047,47 @@ def dequote(string):
return string[1:-1]
return string
+class InlineStash:
+ """ Stores nodes under generated ids and hands out placeholder
+ strings of the form <prefix><type>:<id><suffix> for them. """
+ def __init__(self):
+ """ Create an empty stash. """
+ # Maps the zero-padded id string to the stored node
+ self._nodes = {}
+ # NOTE(review): self.prefix and self.suffix are read here but are not
+ # assigned in the lines visible in this hunk - presumably they are set
+ # on lines not shown; confirm. The constant 4 presumably corresponds to
+ # the "%04d" id width used in _genPlaceholder - confirm.
+ self.phLength = 4 + len(self.prefix) + len(self.suffix)
+ def _genPlaceholder(self, type):
+ """ Generates placeholder
+ Returns a (placeholder, id) pair; the id is the current number of
+ stored nodes formatted as four zero-padded digits. """
+ id = "%04d" % len(self._nodes)
+ hash = "%s%s:%s%s" % (self.prefix, type, id,
+ self.suffix)
+ return hash, id
+ def extractId(self, data, index):
+ """ Extracting id from data string, starting from index
+ Returns (id, position after the suffix) when the text between the
+ prefix and the next suffix splits on ":" into exactly two parts,
+ otherwise (None, index + 1). """
+ endIndex = data.find(self.suffix, index+1)
+ if endIndex == -1:
+ # No closing suffix after `index`
+ return None, index + 1
+ else:
+ pair = data[index + len(self.prefix): endIndex].split(":")
+ if len(pair) == 2:
+ return pair[1], endIndex + len(self.suffix)
+ else:
+ return None, index + 1
+ def isin(self, id):
+ """ Return True if a node is stored under `id`. """
+ return self._nodes.has_key(id)
+ def get(self, id):
+ """ Returns node by id (None when the id is unknown) """
+ return self._nodes.get(id)
+ def add(self, node, type):
+ """ Store `node` and return the placeholder string generated for it. """
+ pholder, id = self._genPlaceholder(type)
+ self._nodes[id] = node
+ return pholder
========================== CORE MARKDOWN =============================
@@ -1318,6 +1125,7 @@ class CorePatterns:
self.regExp['containsline'] = re.compile(r'^([-]*)$|^([=]*)$', re.M)
+ self.regExp['attr'] = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
RE = CorePatterns()
@@ -1353,8 +1161,8 @@ class Markdown:
self.safeMode = safe_mode
self.blockGuru = BlockGuru()
self.registeredExtensions = []
- self.stripTopLevelTags = 1
self.docType = ""
+ self.stripTopLevelTags = True
self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
@@ -1370,20 +1178,19 @@ class Markdown:
self.textPostprocessors = [# a footnote postprocessor will get
# inserted here
self.prePatterns = []
- self.inlinePatterns = [DOUBLE_BACKTICK_PATTERN,
+ self.inlinePatterns = [
@@ -1391,13 +1198,15 @@ class Markdown:
# The order of the handlers matters!!!
+ self.inlineStash = InlineStash()
+ self._inlineOperationID = None
self.registerExtensions(extensions = extensions,
configs = extension_configs)
@@ -1440,6 +1249,7 @@ class Markdown:
self.htmlStash = HtmlStash()
+ self.inlineStash = InlineStash()
HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash
LINE_PREPROCESSOR.stash = self.htmlStash
@@ -1461,17 +1271,13 @@ class Markdown:
def _transform(self):
"""Transform the Markdown text into a XHTML body document.
- Returns: A NanoDom Document
+ Returns: ElementTree object
# Setup the document
- self.doc = Document()
- self.top_element = self.doc.createElement("span")
- self.top_element.appendChild(self.doc.createTextNode('\n'))
- self.top_element.setAttribute('class', 'markdown')
- self.doc.appendChild(self.top_element)
+ self.root = etree.Element("span")
# Split into lines and run the preprocessors that will work with
# self.lines
@@ -1482,32 +1288,24 @@ class Markdown:
for prep in self.preprocessors :
self.lines = prep.run(self.lines)
- # Create a NanoDom tree from the lines and attach it to Document
+ # Create an ElementTree from the lines
buffer = []
for line in self.lines:
if line.startswith("#"):
- self._processSection(self.top_element, buffer)
+ self._processSection(self.root, buffer)
buffer = [line]
- self._processSection(self.top_element, buffer)
- #self._processSection(self.top_element, self.lines)
- # Not sure why I put this in but let's leave it for now.
- self.top_element.appendChild(self.doc.createTextNode('\n'))
- # Run the post-processors
- for postprocessor in self.postprocessors:
- postprocessor.run(self.doc)
- return self.doc
+ self._processSection(self.root, buffer)
+ return etree.ElementTree(self.root)
def _processSection(self, parent_elem, lines,
- inList = 0, looseList = 0):
+ inList=0, looseList=0):
Process a section of a source document, looking for high
level structural elements like lists, block quotes, code
@@ -1517,17 +1315,22 @@ class Markdown:
Keyword arguments:
- * parent_elem: A NanoDom element to which the content will be added.
+ * parent_elem: A ElementTree element to which the content will be added.
* lines: a list of lines
* inList: a level
Returns: None
# Loop through lines until none left.
while lines:
+ # Skipping empty line
+ if not lines[0]:
+ lines = lines[1:]
+ continue
# Check if this section starts with a list, a blockquote or
# a code block
@@ -1571,11 +1374,17 @@ class Markdown:
else: # Ok, so it's just a simple block
paragraph, lines = self._linesUntil(lines, lambda line:
- not line.strip())
+ not line.strip() or line[0] == '>')
if len(paragraph) and paragraph[0].startswith('#'):
self._processHeader(parent_elem, paragraph)
+ elif len(paragraph) and \
+ RE.regExp["isline3"].match(paragraph[0]):
+ self._processHR(parent_elem)
+ lines = paragraph[1:] + lines
elif paragraph:
self._processParagraph(parent_elem, paragraph,
inList, looseList)
@@ -1583,48 +1392,59 @@ class Markdown:
if lines and not lines[0].strip():
lines = lines[1:] # skip the first (blank) line
- def _processHeader(self, parent_elem, paragraph):
+ def _processHR(self, parentElem):
+ """ Append an empty `hr` element to `parentElem`. """
+ # SubElement attaches the new node to the parent itself, so no local
+ # reference to it is kept.
+ etree.SubElement(parentElem, "hr")
+ def _processHeader(self, parentElem, paragraph):
+ """ Add a header element for the first line of `paragraph`.
+ The line is matched against the 'header' regexp; the element is
+ named h<level>, where level is the length of match group 1, and the
+ stripped group 2 text is stored on a nested `inline` element.
+ NOTE(review): the final message() call presumably belongs to an
+ `else:` branch that is not visible in this view - confirm. """
m = RE.regExp['header'].match(paragraph[0])
if m:
level = len(m.group(1))
- h = self.doc.createElement("h%d" % level)
- parent_elem.appendChild(h)
- for item in self._handleInline(m.group(2).strip()):
- h.appendChild(item)
+ h = etree.SubElement(parentElem, "h%d" % level)
+ inline = etree.SubElement(h, "inline")
+ inline.text = m.group(2).strip()
message(CRITICAL, "We've got a problem header!")
- def _processParagraph(self, parent_elem, paragraph, inList, looseList):
- list = self._handleInline("\n".join(paragraph))
+ def _processParagraph(self, parentElem, paragraph, inList, looseList):
+ """ Add the lines of `paragraph` to `parentElem`.
+ The text is stored on `inline` child elements; every line matching
+ the 'isline3' regexp ends the current run of text and is replaced
+ by an `hr` element.
+ * parentElem: element that receives the content.
+ * paragraph: list of lines.
+ * looseList: when true, a `p` wrapper is used even for the first
+ paragraph of an `li`. """
- if ( parent_elem.nodeName == 'li'
- and not (looseList or parent_elem.childNodes)):
+ if ( parentElem.tag == 'li'
+ and not (looseList or parentElem.getchildren())):
# If this is the first paragraph inside "li", don't
# put <p> around it - append the paragraph bits directly
- # onto parent_elem
- el = parent_elem
+ # onto parentElem
+ el = parentElem
# Otherwise make a "p" element
- el = self.doc.createElement("p")
- parent_elem.appendChild(el)
+ el = etree.SubElement(parentElem, "p")
- for item in list:
- el.appendChild(item)
- def _processUList(self, parent_elem, lines, inList):
- self._processList(parent_elem, lines, inList,
+ dump = []
+ # Searching for hr
+ for line in paragraph:
+ if RE.regExp["isline3"].match(line):
+ inline = etree.SubElement(el, "inline")
+ inline.text = "\n".join(dump)
+ etree.SubElement(el, "hr")
+ # Python 2 lists have no clear() method (it was added in
+ # Python 3.3), so start a fresh list for the next run of text.
+ dump = []
+ else:
+ dump.append(line)
+ if dump:
+ text = "\n".join(dump)
+ inline = etree.SubElement(el, "inline")
+ inline.text = text
+ def _processUList(self, parentElem, lines, inList):
+ self._processList(parentElem, lines, inList,
listexpr='ul', tag = 'ul')
- def _processOList(self, parent_elem, lines, inList):
- self._processList(parent_elem, lines, inList,
+ def _processOList(self, parentElem, lines, inList):
+ self._processList(parentElem, lines, inList,
listexpr='ol', tag = 'ol')
- def _processList(self, parent_elem, lines, inList, listexpr, tag):
+ def _processList(self, parentElem, lines, inList, listexpr, tag):
Given a list of document lines starting with a list item,
finds the end of the list, breaks it up, and recursively
@@ -1632,7 +1452,7 @@ class Markdown:
Keyword arguments:
- * parent_elem: A dom element to which the content will be added
+ * parentElem: A ElementTree element to which the content will be added
* lines: a list of lines
* inList: a level
@@ -1640,8 +1460,7 @@ class Markdown:
- ul = self.doc.createElement(tag) # ul might actually be '<ol>'
- parent_elem.appendChild(ul)
+ ul = etree.SubElement(parentElem, tag) # ul might actually be '<ol>'
looseList = 0
@@ -1669,6 +1488,7 @@ class Markdown:
# Check if the next non-blank line is still a part of the list
if ( RE.regExp[listexpr].match(next) or
RE.regExp['tabbed'].match(next) ):
# get rid of any white space in the line
@@ -1702,16 +1522,15 @@ class Markdown:
i += 1
- # Add the dom elements
+ # Add the ElementTree elements
for item in items:
- li = self.doc.createElement("li")
- ul.appendChild(li)
+ li = etree.SubElement(ul, "li")
self._processSection(li, item, inList + 1, looseList = looseList)
# Process the remaining part of the section
- self._processSection(parent_elem, lines[i:], inList)
+ self._processSection(parentElem, lines[i:], inList)
def _linesUntil(self, lines, condition):
@@ -1725,12 +1544,13 @@ class Markdown:
i = -1
for line in lines:
i += 1
- if condition(line): break
+ if condition(line):
+ break
i += 1
return lines[:i], lines[i:]
- def _processQuote(self, parent_elem, lines, inList):
+ def _processQuote(self, parentElem, lines, inList):
Given a list of document lines starting with a quote finds
the end of the quote, unindents it and recursively
@@ -1739,7 +1559,7 @@ class Markdown:
Keyword arguments:
- * parent_elem: DOM element to which the content will be added
+ * parentElem: ElementTree element to which the content will be added
* lines: a list of lines
* inList: a level
@@ -1766,25 +1586,24 @@ class Markdown:
- blockquote = self.doc.createElement('blockquote')
- parent_elem.appendChild(blockquote)
+ blockquote = etree.SubElement(parentElem, "blockquote")
self._processSection(blockquote, dequoted, inList)
- self._processSection(parent_elem, lines[i:], inList)
+ self._processSection(parentElem, lines[i:], inList)
- def _processCodeBlock(self, parent_elem, lines, inList):
+ def _processCodeBlock(self, parentElem, lines, inList):
Given a list of document lines starting with a code block
- finds the end of the block, puts it into the dom verbatim
+ finds the end of the block, puts it into the ElementTree verbatim
wrapped in ("<pre><code>") and recursively processes the
remainder of the text file.
Keyword arguments:
- * parent_elem: DOM element to which the content will be added
+ * parentElem: ElementTree element to which the content will be added
* lines: a list of lines
* inList: a level
@@ -1794,163 +1613,264 @@ class Markdown:
detabbed, theRest = self.blockGuru.detectTabbed(lines)
- pre = self.doc.createElement('pre')
- code = self.doc.createElement('code')
- parent_elem.appendChild(pre)
- pre.appendChild(code)
+ pre = etree.SubElement(parentElem, "pre")
+ code = etree.SubElement(pre, "code")
text = "\n".join(detabbed).rstrip()+"\n"
- #text = text.replace("&", "&amp;")
- code.appendChild(self.doc.createTextNode(text))
- self._processSection(parent_elem, theRest, inList)
- def _handleInline (self, line, patternIndex=0):
+ code.text = text
+ self._processSection(parentElem, theRest, inList)
+ def _handleInline(self, data, patternIndex=0):
- Transform a Markdown line with inline elements to an XHTML
- fragment.
- This function uses auxiliary objects called inline patterns.
- See notes on inline patterns above.
+ Process a string with inline patterns, replacing matches
+ with placeholders.
Keyword arguments:
- * line: A line of Markdown text
+ * data: A line of Markdown text
* patternIndex: The index of the inlinePattern to start with
- Return: A list of NanoDom nodes
+ Return: String with placeholders.
- parts = [line]
- while patternIndex < len(self.inlinePatterns):
- i = 0
- while i < len(parts):
- x = parts[i]
- if isinstance(x, (str, unicode)):
- result = self._applyPattern(x, \
- self.inlinePatterns[patternIndex], \
- patternIndex)
- if result:
- i -= 1
- parts.remove(x)
- for y in result:
- parts.insert(i+1,y)
- i += 1
- patternIndex += 1
- for i in range(len(parts)):
- x = parts[i]
- if isinstance(x, (str, unicode)):
- parts[i] = self.doc.createTextNode(x)
- return parts
+ startIndex = 0
- def _applyPattern(self, line, pattern, patternIndex):
+ while patternIndex < len(self.inlinePatterns):
+ data, matched, startIndex = self._applyInline(
+ self.inlinePatterns[patternIndex],
+ data, patternIndex, startIndex)
+ if not matched:
+ patternIndex += 1
+ return data
+ def _applyInline(self, pattern, data, patternIndex, startIndex=0):
Given a pattern name, this function checks if the line
- fits the pattern, creates the necessary elements, and returns
- back a list consisting of NanoDom elements and/or strings.
+ fits the pattern, creates the necessary elements, adds them
+ to InlineStash, and returns a string with placeholders
+ instead of ElementTree elements.
Keyword arguments:
- * line: the text to be processed
+ * data: the text to be processed
* pattern: the pattern to be checked
+ * patternIndex: index of current pattern
+ * startIndex: string index from which the search starts
- Returns: The appropriate newly created NanoDom element if the
- pattern matches, None otherwise.
+ Returns: Tuple of (string with placeholders, matched flag, next start index).
+ match = pattern.getCompiledRegExp().match(data[startIndex:])
+ leftData = data[:startIndex]
+ if not match:
+ return data, False, 0
- # match the line to pattern's pre-compiled reg exp.
- # if no match, move on.
- m = pattern.getCompiledRegExp().match(line)
- if not m:
- return None
- # if we got a match let the pattern make us a NanoDom node
- # if it doesn't, move on
- node = pattern.handleMatch(m, self.doc)
- # check if any of this nodes have children that need processing
- if isinstance(node, Element):
- if not node.nodeName in ["code", "pre"]:
- for child in node.childNodes:
- if isinstance(child, TextNode):
- result = self._handleInline(child.value, patternIndex+1)
- if result:
- if result == [child]:
- continue
- result.reverse()
- #to make insertion easier
+ node = pattern.handleMatch(match)
+ if node is None:
+ return data, True, len(leftData) + match.span(len(match.groups()))[0]
+ if not isstr(node):
+ if not node.tag in ["code", "pre"]:
+ # We need to process current node too
+ for child in [node] + node.getchildren():
+ if not isstr(node):
+ if child.text:
+ child.text = self._handleInline(child.text,
+ patternIndex + 1)
+ if child.tail:
+ child.tail = self._handleInline(child.tail,
+ patternIndex)
+ pholder = self.inlineStash.add(node, pattern.type())
+ return "%s%s%s%s" % (leftData,
+ match.group(1),
+ pholder, match.groups()[-1]), True, 0
+ def _processElementText(self, node, subnode, isText=True):
+ if isText:
+ text = subnode.text
+ subnode.text = None
+ else:
+ text = subnode.tail
+ subnode.tail = None
+ childResult = self._processPlaceholders(text, subnode)
+ if not isText and node is not subnode:
+ pos = node.getchildren().index(subnode)
+ node.remove(subnode)
+ else:
+ pos = 0
+ childResult.reverse()
+ for newChild in childResult:
+ node.insert(pos, newChild)
+ def _processPlaceholders(self, data, parent):
+ """
+ Processes string with placeholders and generates ElementTree tree.
+ * data: string with placeholders instead of ElementTree elements.
- position = node.childNodes.index(child)
+ Returns: list with ElementTree elements with applied inline patterns.
+ """
+ def linkText(text):
+ if text:
+ if result:
+ if result[-1].tail:
+ result[-1].tail += text
+ else:
+ result[-1].tail = text
+ else:
+ if parent.text:
+ parent.text += text
+ else:
+ parent.text = text
+ result = []
+ prefix = self.inlineStash.prefix
+ strartIndex = 0
+ while data:
+ index = data.find(prefix, strartIndex)
+ if index != -1:
+ id, phEndIndex = self.inlineStash.extractId(data, index)
+ if self.inlineStash.isin(id):
+ node = self.inlineStash.get(id)
+ if index > 0:
+ text = data[strartIndex:index]
+ linkText(text)
+ if not isstr(node): # it's Element
+ for child in [node] + node.getchildren():
+ if child.tail:
+ self._processElementText(node, child, False)
- node.removeChild(child)
- for item in result:
- if isinstance(item, (str, unicode)):
- if len(item) > 0:
- node.insertChild(position,
- self.doc.createTextNode(item))
- else:
- node.insertChild(position, item)
+ if child.text:
+ self._processElementText(child, child)
+ else: # it's just a string
+ linkText(node)
+ strartIndex = phEndIndex
+ continue
+ strartIndex = phEndIndex
+ result.append(node)
+ else: # wrong placeholder
+ end = index + len(prefix)
+ linkText(data[strartIndex:end])
+ strartIndex = end
+ else:
+ text = data[strartIndex:]
+ linkText(text)
+ data = ""
+ return result
+ def _processTree(self, el):
+ """
+ Process an ElementTree, applying inline patterns.
+ Keyword arguments:
+ * el - parent element of ElementTree.
+ Returns: nothing; the tree under el is modified in place.
+ """
- if node:
- # Those are in the reverse order!
- return ( m.groups()[-1], # the string to the left
- node, # the new node
- m.group(1)) # the string to the right of the match
- else:
- return None
+ stack = [el]
+ while stack:
+ currElement = stack.pop()
+ insertQueue = []
+ for child in currElement.getchildren():
+ if child.tag == "inline":
+ lst = self._processPlaceholders(self._handleInline(
+ child.text), currElement)
+ pos = currElement.getchildren().index(child)
+ insertQueue.append((child, pos, lst))
+ else:
+ stack.append(child)
+ for element, pos, lst in insertQueue:
+ currElement.remove(element)
+ if currElement.text:
+ currElement.text = handleAttributes(currElement.text,
+ currElement)
+ for newChild in lst:
+ # Processing attributes
+ if newChild.tail:
+ newChild.tail = handleAttributes(newChild.tail,
+ currElement)
+ if newChild.text:
+ newChild.text = handleAttributes(newChild.text,
+ newChild)
+ currElement.insert(pos, newChild)
+ pos += 1
+ def applyInlinePatterns(self, markdownTree):
+ """
+ Return the ElementTree with inline
+ patterns applied.
+ Keyword arguments:
+ * markdownTree: ElementTree object representing the Markdown tree.
- def convert (self, source=None):
+ Returns: ElementTree object.
- Return the document in XHTML format.
+ el = markdownTree.getroot()
+ self._processTree(el)
+ return markdownTree
+ def markdownToTree(self, source=None):
+ """
+ Return an ElementTree without applying inline patterns;
+ all data that should be processed with
+ inline patterns is wrapped in <inline></inline> sections.
Keyword arguments:
* source: An ascii or unicode string of Markdown formatted text.
- Returns: A serialized XHTML body.
+ Returns: ElementTree object.
if source is not None: #Allow blank string
self.source = source
if not self.source:
return u""
self.source = unicode(self.source)
except UnicodeDecodeError:
message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
return u""
# Fixup the source text
self.source = self.source.replace(START, "")
@@ -1962,20 +1882,48 @@ class Markdown:
for pp in self.textPreprocessors:
self.source = pp.run(self.source)
+ markdownTree = self._transform()
+ return markdownTree
- doc = self._transform()
- xml = doc.toxml()
+ def convert (self, source=None):
+ """
+ Return the document in XHTML format.
+ Keyword arguments:
+ * source: An ascii or unicode string of Markdown formatted text.
- # Return everything but the top level tag
+ Returns: A serialized XHTML body.
+ """
+ tree = self.markdownToTree(source)
+ root = self.applyInlinePatterns(tree).getroot()
+ # Run the post-processors
+ for postprocessor in self.postprocessors:
+ postprocessor.stash = self.htmlStash
+ newRoot = postprocessor.run(root)
+ if newRoot:
+ root = newRoot
+ indentETree(root)
+ xml = codecs.decode(etree.tostring(root, encoding="utf8"), "utf8")
if self.stripTopLevelTags:
- xml = xml.strip()[23:-7] + "\n"
+ xml = xml.strip()[44:-7] + "\n"
+ # Run the text post-processors
for pp in self.textPostprocessors:
xml = pp.run(xml)
- return (self.docType + xml).strip()
+ return xml.strip()
def __str__(self):