diff options
Diffstat (limited to 'markdown.py')
-rwxr-xr-x[-rw-r--r--] | markdown.py | 1208 |
1 files changed, 578 insertions, 630 deletions
diff --git a/markdown.py b/markdown.py index 86f83ab..059ac87 100644..100755 --- a/markdown.py +++ b/markdown.py @@ -33,13 +33,15 @@ __revision__ = "$Rev$" -import re, sys, codecs +import re, sys, codecs, htmlentitydefs from urlparse import urlparse, urlunparse from logging import getLogger, StreamHandler, Formatter, \ DEBUG, INFO, WARN, ERROR, CRITICAL + + MESSAGE_THRESHOLD = CRITICAL @@ -56,13 +58,59 @@ logger.addHandler(console_hndlr) def message(level, text): ''' A wrapper method for logging debug messages. ''' logger.log(level, text) - + +def isstr(s): + return isinstance(s, unicode) or isinstance(s, str) + +def importETree(): + """ Imports best variant of ElementTree + and returns module object """ + + try: + # Python 2.5+ + import xml.etree.cElementTree as etree + except ImportError: + try: + # Python 2.5+ + import xml.etree.ElementTree as etree + except ImportError: + try: + # normal cElementTree install + import cElementTree as etree + except ImportError: + try: + # normal ElementTree install + import elementtree.ElementTree as etree + except ImportError: + message(CRITICAL, + "Failed to import ElementTree from any known place") + sys.exit(1) + return etree + +etree = importETree() + +def indentETree(elem, level=0): + + if level > 1: + i = "\n" + (level-1)*" " + else: + i = "\n" + + if len(elem): + if not elem.text or not elem.text.strip(): + elem.text = i + " " + for e in elem: + indentETree(e, level+1) + if not e.tail or not e.tail.strip(): + e.tail = i + if level and (not elem.tail or not elem.tail.strip()): + elem.tail = i # --------------- CONSTANTS YOU MIGHT WANT TO MODIFY ----------------- TAB_LENGTH = 4 # expand tabs to this many spaces ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz"> -SMART_EMPHASIS = 1 # this_or_that does not become this<i>or</i>that +SMART_EMPHASIS = True # this_or_that does not become this<i>or</i>that HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'), @@ -107,6 +155,11 @@ EXECUTABLE_NAME_FOR_USAGE = "python markdown.py" # --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ---------- +AND_SUBSTITUTE = unichr(2) + unichr(4) + unichr(3) + +INLINE_PLACEHOLDER_PREFIX = u'\u0001' +INLINE_PLACEHOLDER_SUFFIX = u'\u0002' + # a template for html placeholders START = u'\u0001' END = u'\u0002' @@ -126,330 +179,23 @@ def isBlockLevel (tag): return ( (tag in BLOCK_LEVEL_ELEMENTS) or (tag[0] == 'h' and tag[1] in "0123456789") ) -""" -====================================================================== -========================== NANODOM =================================== -====================================================================== - -The three classes below implement some of the most basic DOM -methods. I use this instead of minidom because I need a simpler -functionality and do not want to require additional libraries. - -Importantly, NanoDom does not do normalization, which is what we -want. It also adds extra white space when converting DOM to string -""" - -ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&"), - (re.compile("<"), "<"), - (re.compile(">"), ">")] - -ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&"), - (re.compile("<"), "<"), - (re.compile(">"), ">"), - (re.compile("\""), """)] - - -def getBidiType(text): - """ - Get Bi-directional text type. Used by TextNode to determine text direction. - """ - - if not text: return None - - ch = text[0] - - if not isinstance(ch, unicode) or not ch.isalpha(): - return None +def codepoint2name(code): + """ Returns entity definition by code, or code + if there is no such entity definition""" + entity = htmlentitydefs.codepoint2name.get(code) + if entity: + return "%s%s;" % (AND_SUBSTITUTE, entity) else: - - for min, max in RTL_BIDI_RANGES: - if ( ch >= min and ch <= max ): - return "rtl" - else: - return "ltr" - - -class Document: - """ - Document root of the NanoDom. An instance stores DOM elements as children. - - """ - - def __init__ (self): - """ Create a NanoDom document. """ - self.bidi = "ltr" - - def appendChild(self, child): - """ Add a dom element as a child of the document root. """ - self.documentElement = child - child.isDocumentElement = True - child.parent = self - self.entities = {} - - def setBidi(self, bidi): - """ Set text direction (right-left or left-right).""" - if bidi: - self.bidi = bidi - - def createElement(self, tag, textNode=None): - """ Given a tag or textNode, return a dom element. """ - el = Element(tag) - el.doc = self - if textNode: - el.appendChild(self.createTextNode(textNode)) - return el - - def createTextNode(self, text): - """ Return given text as a TextNode. """ - node = TextNode(text) - node.doc = self - return node - - def createEntityReference(self, entity): - """ Return an html entitry reference (i.e.: `&`). """ - if entity not in self.entities: - self.entities[entity] = EntityReference(entity) - return self.entities[entity] - - def createCDATA(self, text): - """ Return the given text as a CDATA node. """ - node = CDATA(text) - node.doc = self - return node - - def toxml (self): - """ Convert document to xml and return a string. """ - return self.documentElement.toxml() - - def normalizeEntities(self, text, avoidDoubleNormalizing=False): - """ Return the given text as an html entity (i.e.: `<` => `>`). """ - if avoidDoubleNormalizing: - regexps = ENTITY_NORMALIZATION_EXPRESSIONS_SOFT - else: - regexps = ENTITY_NORMALIZATION_EXPRESSIONS - - for regexp, substitution in regexps: - text = regexp.sub(substitution, text) - return text - - def find(self, test): - """ Return a list of descendants that pass the test function """ - return self.documentElement.find(test) - - def unlink(self): - """ Cleanup: Remove all children from the document. """ - self.documentElement.unlink() - self.documentElement = None - - -class CDATA: - """ CDATA node type of NanoDom. """ - type = "cdata" - - def __init__ (self, text): - """ Create a CDATA node with given text. """ - self.text = text - - def handleAttributes(self): - """ Not implemented for CDATA node type. """ - pass - - def toxml (self): - """ Return CDATA node as a string. """ - return "<![CDATA[" + self.text + "]]>" - -class Element: - """ - Element node type of Nanodom. + return "%s#%d;" % (AND_SUBSTITUTE, code) - All html tags would most likely be represented as Elements. - - """ - type = "element" - - def __init__ (self, tag): - """ Create an Element node instance. """ - self.nodeName = tag - self.attributes = [] - self.attribute_values = {} - self.childNodes = [] - self.bidi = None - self.isDocumentElement = False - - def setBidi(self, bidi): - """ Set text direction (i.e.: right-left or left-right). """ - if bidi: - - orig_bidi = self.bidi - - if not self.bidi or self.isDocumentElement: - # Once the bidi is set don't change it (except for doc element) - self.bidi = bidi - self.parent.setBidi(bidi) - - - def unlink(self): - """ Cleanup: Remove all children of the Element. """ - for child in self.childNodes: - if child.type == "element": - child.unlink() - self.childNodes = None - - def setAttribute(self, attr, value): - """ - Assign an html/xml attribute to the Element (i.e.: id, class, href). - """ - if not attr in self.attributes: - self.attributes.append(attr) - - self.attribute_values[attr] = value - - def insertChild(self, position, child): - """ Insert a child Element at the given position. """ - self.childNodes.insert(position, child) - child.parent = self - - def removeChild(self, child): - """ Remove the given child from the Element. """ - self.childNodes.remove(child) - - def replaceChild(self, oldChild, newChild): - """ Replace an old child Element with a new child Element. """ - position = self.childNodes.index(oldChild) - self.removeChild(oldChild) - self.insertChild(position, newChild) - - def appendChild(self, child): - """ Append a new child Element to the end of the child Elements. """ - self.childNodes.append(child) - child.parent = self - - def handleAttributes(self): - """ Not implemented for Element node type. """ - pass - - def find(self, test, depth=0): - """ Returns a list of descendants that pass the test function """ - matched_nodes = [] - for child in self.childNodes: - if test(child): - matched_nodes.append(child) - if child.type == "element": - matched_nodes += child.find(test, depth+1) - return matched_nodes - - def toxml(self): - """ Return the Element and all children as a string. """ - if ENABLE_ATTRIBUTES: - for child in self.childNodes: - child.handleAttributes() - - buffer = "" - if self.nodeName in ['h1', 'h2', 'h3', 'h4']: - buffer += "\n" - elif self.nodeName in ['li']: - buffer += "\n " - - # Process children FIRST, then do the attributes - - childBuffer = "" - - if self.childNodes or self.nodeName in ['blockquote']: - childBuffer += ">" - for child in self.childNodes: - childBuffer += child.toxml() - if self.nodeName == 'p': - childBuffer += "\n" - elif self.nodeName == 'li': - childBuffer += "\n " - childBuffer += "</%s>" % self.nodeName - else: - childBuffer += "/>" - - - - buffer += "<" + self.nodeName - - if self.nodeName in ['p', 'li', 'ul', 'ol', - 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']: - - if not self.attribute_values.has_key("dir"): - if self.bidi: - bidi = self.bidi - else: - bidi = self.doc.bidi - - if bidi=="rtl": - self.setAttribute("dir", "rtl") - - for attr in self.attributes: - value = self.attribute_values[attr] - value = self.doc.normalizeEntities(value, - avoidDoubleNormalizing=True) - buffer += ' %s="%s"' % (attr, value) - - - # Now let's actually append the children - - buffer += childBuffer - - if self.nodeName in ['p', 'br ', 'li', 'ul', 'ol', - 'h1', 'h2', 'h3', 'h4'] : - buffer += "\n" - - return buffer - - -class TextNode: - """ A Text node type of the NanoDom. """ - type = "text" - attrRegExp = re.compile(r'\{@([^\}]*)=([^\}]*)}') # {@id=123} - - def __init__ (self, text): - """ Create a TextNode with the given text. """ - self.value = text - - def attributeCallback(self, match): - """ Regex callback method to set attribute on parent. """ - self.parent.setAttribute(match.group(1), match.group(2)) - - def handleAttributes(self): - """ Parse and assign attributes to the parent Element. """ - self.value = self.attrRegExp.sub(self.attributeCallback, self.value) - - def toxml(self): - """ Return the TextNode as a string. """ - text = self.value - - self.parent.setBidi(getBidiType(text)) +def handleAttributes(text, parent): - if not text.startswith(HTML_PLACEHOLDER_PREFIX): - if self.parent.nodeName == "p": - text = text.replace("\n", "\n ") - elif (self.parent.nodeName == "li" - and self.parent.childNodes[0]==self): - text = "\n " + text.replace("\n", "\n ") - text = self.doc.normalizeEntities(text) - return text - - -class EntityReference: - """ EntityReference node type of NanoDom. """ - type = "entity_ref" - - def __init__(self, entity): - """ Create an EntityReference of the given entity. """ - self.entity = entity - - def handleAttributes(self): - """ Not implemented for EntityReference. """ - pass - - def toxml(self): - """ Return the EntityReference as a string. """ - return "&" + self.entity + ";" + def attributeCallback(match): + parent.set(match.group(1), match.group(2)) + return RE.regExp['attr'].sub(attributeCallback, text) + """ ====================================================================== @@ -613,7 +359,7 @@ class HeaderPreprocessor(Preprocessor): """ Replace underlined headers with hashed headers to avoid - the nead for lookahead later. + the need for lookahead later. """ def run (self, lines): @@ -656,29 +402,32 @@ class LinePreprocessor(Preprocessor): blockquote_re = re.compile(r'^(> )+') def run (self, lines): - """ Find a store HR lines. """ + """ Find and replace HR lines. """ for i in range(len(lines)): prefix = '' m = self.blockquote_re.search(lines[i]) - if m : prefix = m.group(0) + if m: + prefix = m.group(0) if self._isLine(lines[i][len(prefix):]): - lines[i] = prefix + self.stash.store("<hr />", safe=True) + #lines[i] = prefix + self.stash.store("<hr />", safe=True) + lines[i] = prefix + "___" return lines def _isLine(self, block): """Determine if a block should be replaced with an <HR>""" - if block.startswith(" "): return 0 # a code block + if block.startswith(" "): + return False # a code block text = "".join([x for x in block if not x.isspace()]) if len(text) <= 2: - return 0 + return False for pattern in ['isline1', 'isline2', 'isline3']: m = RE.regExp[pattern].match(text) if (m and m.group(1)): - return 1 + return True else: - return 0 + return False LINE_PREPROCESSOR = LinePreprocessor() @@ -713,6 +462,7 @@ class ReferencePreprocessor(Preprocessor): REFERENCE_PREPROCESSOR = ReferencePreprocessor() + """ ====================================================================== ========================== INLINE PATTERNS =========================== @@ -725,9 +475,8 @@ expression and needs support the following methods: pattern.getCompiledRegExp() - returns a regular expression - pattern.handleMatch(m, doc) - takes a match object and returns - a NanoDom node (as a part of the provided - doc) or None + pattern.handleMatch(m) - takes a match object and returns + a ElementTree element or just plain text All of python markdown's built-in patterns subclass from Pattern, but you can add additional patterns that don't. @@ -765,32 +514,31 @@ BRK = ( r'\[(' + NOBRACKET + r')\]' ) NOIMG = r'(?<!\!)' -BACKTICK_RE = r'\`([^\`]*)\`' # `e= m*c^2` -DOUBLE_BACKTICK_RE = r'\`\`(.*?)\`\`' # ``e=f("`")`` +BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)' # `e=f()` or ``e=f("`")`` ESCAPE_RE = r'\\(.)' # \< -EMPHASIS_RE = r'\*([^\*]*)\*' # *emphasis* -STRONG_RE = r'\*\*(.*?)\*\*' # **strong** -STRONG_EM_RE = r'\*\*\*(.*?)\*\*\*' # ***strong*** +EMPHASIS_RE = r'(\*)([^\*]*)\2' # *emphasis* +STRONG_RE = r'(\*{2}|_{2})(.*?)\2' # **strong** +STRONG_EM_RE = r'(\*{3}|_{3})(.*?)\2' # ***strong*** if SMART_EMPHASIS: - EMPHASIS_2_RE = r'(?<!\S)_(\S[^_]*)_' # _emphasis_ + EMPHASIS_2_RE = r'(?<!\S)(_)(\S.*?)\2' # _emphasis_ else: - EMPHASIS_2_RE = r'_([^_]*)_' # _emphasis_ + EMPHASIS_2_RE = r'(_)(.*?)\2' # _emphasis_ -STRONG_2_RE = r'__(.*?)__' # __strong__ -STRONG_EM_2_RE = r'___(.*?)___' # ___strong___ +#LINK_RE = NOIMG + BRK + r'\s*\(([^\)]*)\)' # [text](url) -LINK_RE = NOIMG + BRK + r'\s*\(([^\)]*)\)' # [text](url) -LINK_ANGLED_RE = NOIMG + BRK + r'\s*\(<([^\)]*)>\)' # [text](<url>) -IMAGE_LINK_RE = r'\!' + BRK + r'\s*\(([^\)]*)\)' # ![alttxt](http://x.com/) +LINK_RE = NOIMG + BRK + \ +r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*)\12)?\)''' # [text](url) or [text](<url>) + +IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^\)]*))\)' # ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>) REFERENCE_RE = NOIMG + BRK+ r'\s*\[([^\]]*)\]' # [Google][3] IMAGE_REFERENCE_RE = r'\!' + BRK + '\s*\[([^\]]*)\]' # ![alt text][2] NOT_STRONG_RE = r'( \* )' # stand-alone * or _ AUTOLINK_RE = r'<((?:f|ht)tps?://[^>]*)>' # <http://www.123.com> AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>' # <me@example.com> #HTML_RE = r'(\<[^\>]*\>)' # <...> -HTML_RE = r'(\<[a-zA-Z/][^\>]*\>)' # <...> -ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # & +HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)' # <...> +ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # & LINE_BREAK_RE = r' \n' # two spaces at end of line LINE_BREAK_2_RE = r' $' # two spaces at end of text @@ -816,30 +564,35 @@ class Pattern: """ Return a compiled regular expression. """ return self.compiled_re - def handleMatch(self, m, doc): + def handleMatch(self, m): """ - Return a NanoDom element from the given match. Subclasses should + Return a ElementTree element from the given match. Subclasses should override this method. Keyword arguments: * m: A re match object containing a match of the pattern. - * doc: An instance of a NanoDom Document. - """ pass + + def type(self): + """ Return class name, to define pattern type """ + return self.__class__.__name__ BasePattern = Pattern # for backward compatibility class SimpleTextPattern (Pattern): - """ Return a simple TextNode of group(2) of a Pattern. """ - def handleMatch(self, m, doc): - return doc.createTextNode(m.group(2)) + """ Return a simple text of group(2) of a Pattern. """ + def handleMatch(self, m): + text = m.group(2) + if text == INLINE_PLACEHOLDER_PREFIX: + return None + return text class SimpleTagPattern (Pattern): """ - Return NanoDom Element of type `tag` with a child TextNode of group(2) + Return element of type `tag` with a text attribute of group(3) of a Pattern. """ @@ -847,71 +600,69 @@ class SimpleTagPattern (Pattern): Pattern.__init__(self, pattern) self.tag = tag - def handleMatch(self, m, doc): - el = doc.createElement(self.tag) - el.appendChild(doc.createTextNode(m.group(2))) + def handleMatch(self, m): + el = etree.Element(self.tag) + el.text = m.group(3) return el class SubstituteTagPattern (SimpleTagPattern): - """ Return a NanoDom ELement of type `tag` with no children. """ - def handleMatch (self, m, doc): - return doc.createElement(self.tag) + """ Return a eLement of type `tag` with no children. """ + def handleMatch (self, m): + return etree.Element(self.tag) class BacktickPattern (Pattern): - """ Return a NanoDom `<code>` Element containing the matching text. """ + """ Return a `<code>` element containing the matching text. """ def __init__ (self, pattern): Pattern.__init__(self, pattern) self.tag = "code" - def handleMatch(self, m, doc): - el = doc.createElement(self.tag) - text = m.group(2).strip() - #text = text.replace("&", "&") - el.appendChild(doc.createTextNode(text)) + def handleMatch(self, m): + el = etree.Element(self.tag) + el.text = m.group(3).strip() return el class DoubleTagPattern (SimpleTagPattern): """ - Return a TextNode nested in tag2 nested in tag1. + Return a ElementTree element nested in tag2 nested in tag1. Usefull for strong emphasis etc. """ - def handleMatch(self, m, doc): + def handleMatch(self, m): tag1, tag2 = self.tag.split(",") - el1 = doc.createElement(tag1) - el2 = doc.createElement(tag2) - el1.appendChild(el2) - el2.appendChild(doc.createTextNode(m.group(2))) + el1 = etree.Element(tag1) + el2 = etree.SubElement(el1, tag2) + el2.text = m.group(3) return el1 class HtmlPattern (Pattern): """ Store raw inline html and return a placeholder. """ - def handleMatch (self, m, doc): + def handleMatch (self, m): rawhtml = m.group(2) inline = True place_holder = self.stash.store(rawhtml) - return doc.createTextNode(place_holder) + return place_holder class LinkPattern (Pattern): - """ Return a NanoDom link Element from the given match. """ - def handleMatch(self, m, doc): - el = doc.createElement('a') - el.appendChild(doc.createTextNode(m.group(2))) - parts = m.group(9).split('"') - # We should now have [], [href], or [href, title] - if parts: - el.setAttribute('href', self.sanatize_url(parts[0].strip())) + """ Return a link element from the given match. """ + def handleMatch(self, m): + el = etree.Element("a") + el.text = m.group(2) + title = m.group(11) + href = m.group(9) + if href: + if href[0] == "<": + href = href[1:-1] + el.set("href", self.sanatize_url(href.strip())) else: - el.setAttribute('href', "") - if len(parts) > 1: - # we also got a title - title = '"' + '"'.join(parts[1:]).strip() + el.set("href", "") + + if title: title = dequote(title) #.replace('"', """) - el.setAttribute('title', title) + el.set("title", title) return el def sanatize_url(self, url): @@ -947,30 +698,32 @@ class LinkPattern (Pattern): return urlunparse(url) class ImagePattern(LinkPattern): - """ Return a NanoDom img Element from the given match. """ - def handleMatch(self, m, doc): - el = doc.createElement('img') + """ Return a img element from the given match. """ + + def handleMatch(self, m): + el = etree.Element("img") src_parts = m.group(9).split() if src_parts: - el.setAttribute('src', self.sanatize_url(src_parts[0])) + src = src_parts[0] + if src[0] == "<" and src[-1] == ">": + src = src[1:-1] + el.set('src', self.sanatize_url(src)) else: - el.setAttribute('src', "") + el.set('src', "") if len(src_parts) > 1: - el.setAttribute('title', dequote(" ".join(src_parts[1:]))) + el.set('title', dequote(" ".join(src_parts[1:]))) + if ENABLE_ATTRIBUTES: - text = doc.createTextNode(m.group(2)) - el.appendChild(text) - text.handleAttributes() - truealt = text.value - el.childNodes.remove(text) + truealt = handleAttributes(m.group(2), el) else: truealt = m.group(2) - el.setAttribute('alt', truealt) + + el.set('alt', truealt) return el class ReferencePattern(LinkPattern): - """ Match to a stored reference and return a NanoDom link Element. """ - def handleMatch(self, m, doc): + """ Match to a stored reference and return link element. """ + def handleMatch(self, m): if m.group(9): id = m.group(9).lower() @@ -982,35 +735,38 @@ class ReferencePattern(LinkPattern): if not self.references.has_key(id): # ignore undefined refs return None href, title = self.references[id] + text = m.group(2) - return self.makeTag(href, title, text, doc) + return self.makeTag(href, title, text) - def makeTag(self, href, title, text, doc): - el = doc.createElement('a') - el.setAttribute('href', self.sanatize_url(href)) + def makeTag(self, href, title, text): + el = etree.Element('a') + + el.set('href', self.sanatize_url(href)) if title: - el.setAttribute('title', title) - el.appendChild(doc.createTextNode(text)) + el.set('title', title) + + el.text = text return el class ImageReferencePattern (ReferencePattern): - """ Match to a stored reference and return a NanoDom img Element. """ - def makeTag(self, href, title, text, doc): - el = doc.createElement('img') - el.setAttribute('src', self.sanatize_url(href)) + """ Match to a stored reference and return img element. """ + def makeTag(self, href, title, text): + el = etree.Element("img") + el.set("src", self.sanatize_url(href)) if title: - el.setAttribute('title', title) - el.setAttribute('alt', text) + el.set("title", title) + el.set("alt", text) return el class AutolinkPattern (Pattern): """ Return a link Element given an autolink (`<http://example/com>`). """ - def handleMatch(self, m, doc): - el = doc.createElement('a') - el.setAttribute('href', m.group(2)) - el.appendChild(doc.createTextNode(m.group(2))) + def handleMatch(self, m): + el = etree.Element("a") + el.set('href', m.group(2)) + el.text = m.group(2) return el class AutomailPattern (Pattern): @@ -1018,37 +774,35 @@ class AutomailPattern (Pattern): Return a mailto link Element given an automail link (`<foo@example.com>`). """ - def handleMatch(self, m, doc): - el = doc.createElement('a') + def handleMatch(self, m): + el = etree.Element('a') email = m.group(2) if email.startswith("mailto:"): email = email[len("mailto:"):] + el.text = "" for letter in email: - entity = doc.createEntityReference("#%d" % ord(letter)) - el.appendChild(entity) + el.text += codepoint2name(ord(letter)) + mailto = "mailto:" + email - mailto = "".join(['&#%d;' % ord(letter) for letter in mailto]) - el.setAttribute('href', mailto) + mailto = "".join([AND_SUBSTITUTE + '#%d;' % + ord(letter) for letter in mailto]) + el.set('href', mailto) return el ESCAPE_PATTERN = SimpleTextPattern(ESCAPE_RE) NOT_STRONG_PATTERN = SimpleTextPattern(NOT_STRONG_RE) BACKTICK_PATTERN = BacktickPattern(BACKTICK_RE) -DOUBLE_BACKTICK_PATTERN = BacktickPattern(DOUBLE_BACKTICK_RE) STRONG_PATTERN = SimpleTagPattern(STRONG_RE, 'strong') -STRONG_PATTERN_2 = SimpleTagPattern(STRONG_2_RE, 'strong') EMPHASIS_PATTERN = SimpleTagPattern(EMPHASIS_RE, 'em') EMPHASIS_PATTERN_2 = SimpleTagPattern(EMPHASIS_2_RE, 'em') STRONG_EM_PATTERN = DoubleTagPattern(STRONG_EM_RE, 'strong,em') -STRONG_EM_PATTERN_2 = DoubleTagPattern(STRONG_EM_2_RE, 'strong,em') LINE_BREAK_PATTERN = SubstituteTagPattern(LINE_BREAK_RE, 'br ') LINE_BREAK_PATTERN_2 = SubstituteTagPattern(LINE_BREAK_2_RE, 'br ') LINK_PATTERN = LinkPattern(LINK_RE) -LINK_ANGLED_PATTERN = LinkPattern(LINK_ANGLED_RE) IMAGE_LINK_PATTERN = ImagePattern(IMAGE_LINK_RE) IMAGE_REFERENCE_PATTERN = ImageReferencePattern(IMAGE_REFERENCE_RE) REFERENCE_PATTERN = ReferencePattern(REFERENCE_RE) @@ -1075,10 +829,10 @@ There are two types of post-processors: Postprocessor and TextPostprocessor class Postprocessor: """ - Postprocessors are run before the dom it converted back into text. + Postprocessors are run before the ElementTree serialization. Each Postprocessor implements a "run" method that takes a pointer to a - NanoDom document, modifies it as necessary and returns a NanoDom + ElementTree, modifies it as necessary and returns a ElementTree document. Postprocessors must extend markdown.Postprocessor. @@ -1088,11 +842,10 @@ class Postprocessor: """ - def run(self, dom): + def run(self, et): """ Subclasses of Postprocessor should implement a `run` method, which - takes a NanoDOm document and returns a (possably modified) NanoDom - document. + takes a ElementTree and returns a (possably modified) ElementTree. """ pass @@ -1101,7 +854,7 @@ class Postprocessor: class TextPostprocessor: """ - TextPostprocessors are run after the dom it converted back into text. + TextPostprocessors are run after the ElementTree it converted back into text. Each TextPostprocessor implements a "run" method that takes a pointer to a text string, modifies it as necessary and returns a text string. @@ -1152,6 +905,20 @@ class RawHtmlTextPostprocessor(TextPostprocessor): RAWHTMLTEXTPOSTPROCESSOR = RawHtmlTextPostprocessor() + +class AndSubstitutePostprocessor(TextPostprocessor): + """ Restore valid entities """ + def __init__(self): + pass + + def run(self, text): + + text = text.replace(AND_SUBSTITUTE, "&") + return text + +ANDSUBSTITUTETEXTPOSTPROCESSOR = AndSubstitutePostprocessor() + + """ ====================================================================== ========================== MISC AUXILIARY CLASSES ==================== @@ -1280,7 +1047,47 @@ def dequote(string): return string[1:-1] else: return string - + + +class InlineStash: + + def __init__(self): + self.prefix = INLINE_PLACEHOLDER_PREFIX + self.suffix = INLINE_PLACEHOLDER_SUFFIX + self._nodes = {} + self.phLength = 4 + len(self.prefix) + len(self.suffix) + + def _genPlaceholder(self, type): + """ Generates placeholder """ + id = "%04d" % len(self._nodes) + hash = "%s%s:%s%s" % (self.prefix, type, id, + self.suffix) + return hash, id + + def extractId(self, data, index): + """ Extracting id from data string, starting from index """ + endIndex = data.find(self.suffix, index+1) + if endIndex == -1: + return None, index + 1 + else: + pair = data[index + len(self.prefix): endIndex].split(":") + if len(pair) == 2: + return pair[1], endIndex + len(self.suffix) + else: + return None, index + 1 + + def isin(self, id): + return self._nodes.has_key(id) + + def get(self, id): + """ Returns node by id """ + return self._nodes.get(id) + + def add(self, node, type): + pholder, id = self._genPlaceholder(type) + self._nodes[id] = node + return pholder + """ ====================================================================== ========================== CORE MARKDOWN ============================= @@ -1318,6 +1125,7 @@ class CorePatterns: re.DOTALL) self.regExp['containsline'] = re.compile(r'^([-]*)$|^([=]*)$', re.M) + self.regExp['attr'] = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} RE = CorePatterns() @@ -1353,8 +1161,8 @@ class Markdown: self.safeMode = safe_mode self.blockGuru = BlockGuru() self.registeredExtensions = [] - self.stripTopLevelTags = 1 self.docType = "" + self.stripTopLevelTags = True self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR] @@ -1370,20 +1178,19 @@ class Markdown: self.textPostprocessors = [# a footnote postprocessor will get # inserted here - RAWHTMLTEXTPOSTPROCESSOR] + RAWHTMLTEXTPOSTPROCESSOR, + ANDSUBSTITUTETEXTPOSTPROCESSOR] self.prePatterns = [] - - - self.inlinePatterns = [DOUBLE_BACKTICK_PATTERN, + + self.inlinePatterns = [ BACKTICK_PATTERN, ESCAPE_PATTERN, REFERENCE_PATTERN, - LINK_ANGLED_PATTERN, LINK_PATTERN, IMAGE_LINK_PATTERN, - IMAGE_REFERENCE_PATTERN, - AUTOLINK_PATTERN, + IMAGE_REFERENCE_PATTERN, + AUTOLINK_PATTERN, AUTOMAIL_PATTERN, LINE_BREAK_PATTERN_2, LINE_BREAK_PATTERN, @@ -1391,13 +1198,15 @@ class Markdown: ENTITY_PATTERN, NOT_STRONG_PATTERN, STRONG_EM_PATTERN, - STRONG_EM_PATTERN_2, STRONG_PATTERN, - STRONG_PATTERN_2, EMPHASIS_PATTERN, EMPHASIS_PATTERN_2 # The order of the handlers matters!!! ] + + self.inlineStash = InlineStash() + + self._inlineOperationID = None self.registerExtensions(extensions = extensions, configs = extension_configs) @@ -1440,6 +1249,7 @@ class Markdown: """ self.references={} self.htmlStash = HtmlStash() + self.inlineStash = InlineStash() HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash LINE_PREPROCESSOR.stash = self.htmlStash @@ -1461,17 +1271,13 @@ class Markdown: def _transform(self): """Transform the Markdown text into a XHTML body document. - Returns: A NanoDom Document + Returns: ElementTree object """ # Setup the document - - self.doc = Document() - self.top_element = self.doc.createElement("span") - self.top_element.appendChild(self.doc.createTextNode('\n')) - self.top_element.setAttribute('class', 'markdown') - self.doc.appendChild(self.top_element) + + self.root = etree.Element("span") # Split into lines and run the preprocessors that will work with # self.lines @@ -1482,32 +1288,24 @@ class Markdown: for prep in self.preprocessors : self.lines = prep.run(self.lines) - # Create a NanoDom tree from the lines and attach it to Document - + # Create a ElementTree from the lines buffer = [] for line in self.lines: if line.startswith("#"): - self._processSection(self.top_element, buffer) + + self._processSection(self.root, buffer) buffer = [line] else: buffer.append(line) - self._processSection(self.top_element, buffer) - - #self._processSection(self.top_element, self.lines) - - # Not sure why I put this in but let's leave it for now. - self.top_element.appendChild(self.doc.createTextNode('\n')) - - # Run the post-processors - for postprocessor in self.postprocessors: - postprocessor.run(self.doc) - return self.doc + self._processSection(self.root, buffer) + + return etree.ElementTree(self.root) def _processSection(self, parent_elem, lines, - inList = 0, looseList = 0): + inList=0, looseList=0): """ Process a section of a source document, looking for high level structural elements like lists, block quotes, code @@ -1517,17 +1315,22 @@ class Markdown: Keyword arguments: - * parent_elem: A NanoDom element to which the content will be added. + * parent_elem: A ElementTree element to which the content will be added. * lines: a list of lines * inList: a level Returns: None """ - + # Loop through lines until none left. while lines: - + + # Skipping empty line + if not lines[0]: + lines = lines[1:] + continue + # Check if this section starts with a list, a blockquote or # a code block @@ -1571,11 +1374,17 @@ class Markdown: else: # Ok, so it's just a simple block paragraph, lines = self._linesUntil(lines, lambda line: - not line.strip()) + not line.strip() or line[0] == '>') if len(paragraph) and paragraph[0].startswith('#'): self._processHeader(parent_elem, paragraph) + + elif len(paragraph) and \ + RE.regExp["isline3"].match(paragraph[0]): + self._processHR(parent_elem) + lines = paragraph[1:] + lines + elif paragraph: self._processParagraph(parent_elem, paragraph, inList, looseList) @@ -1583,48 +1392,59 @@ class Markdown: if lines and not lines[0].strip(): lines = lines[1:] # skip the first (blank) line - - def _processHeader(self, parent_elem, paragraph): + def _processHR(self, parentElem): + hr = etree.SubElement(parentElem, "hr") + + def _processHeader(self, parentElem, paragraph): m = RE.regExp['header'].match(paragraph[0]) if m: level = len(m.group(1)) - h = self.doc.createElement("h%d" % level) - parent_elem.appendChild(h) - for item in self._handleInline(m.group(2).strip()): - h.appendChild(item) + h = etree.SubElement(parentElem, "h%d" % level) + inline = etree.SubElement(h, "inline") + inline.text = m.group(2).strip() else: message(CRITICAL, "We've got a problem header!") - def _processParagraph(self, parent_elem, paragraph, inList, looseList): - list = self._handleInline("\n".join(paragraph)) + def _processParagraph(self, parentElem, paragraph, inList, looseList): - if ( parent_elem.nodeName == 'li' - and not (looseList or parent_elem.childNodes)): + if ( parentElem.tag == 'li' + and not (looseList or parentElem.getchildren())): # If this is the first paragraph inside "li", don't # put <p> around it - append the paragraph bits directly - # onto parent_elem - el = parent_elem + # onto parentElem + el = parentElem else: # Otherwise make a "p" element - el = self.doc.createElement("p") - parent_elem.appendChild(el) + el = etree.SubElement(parentElem, "p") - for item in list: - el.appendChild(item) - - - def _processUList(self, parent_elem, lines, inList): - self._processList(parent_elem, lines, inList, + dump = [] + + # Searching for hr + for line in paragraph: + if RE.regExp["isline3"].match(line): + inline = etree.SubElement(el, "inline") + inline.text = "\n".join(dump) + etree.SubElement(el, "hr") + dump.clear() + else: + dump.append(line) + if dump: + text = "\n".join(dump) + inline = etree.SubElement(el, "inline") + inline.text = text + + def _processUList(self, parentElem, lines, inList): + self._processList(parentElem, lines, inList, listexpr='ul', tag = 'ul') - def _processOList(self, parent_elem, lines, inList): - self._processList(parent_elem, lines, inList, + def _processOList(self, parentElem, lines, inList): + self._processList(parentElem, lines, inList, listexpr='ol', tag = 'ol') - def _processList(self, parent_elem, lines, inList, listexpr, tag): + def _processList(self, parentElem, lines, inList, listexpr, tag): """ Given a list of document lines starting with a list item, finds the end of the list, breaks it up, and recursively @@ -1632,7 +1452,7 @@ class Markdown: Keyword arguments: - * parent_elem: A dom element to which the content will be added + * parentElem: A ElementTree element to which the content will be added * lines: a list of lines * inList: a level @@ -1640,8 +1460,7 @@ class Markdown: """ - ul = self.doc.createElement(tag) # ul might actually be '<ol>' - parent_elem.appendChild(ul) + ul = etree.SubElement(parentElem, tag) # ul might actually be '<ol>' looseList = 0 @@ -1669,6 +1488,7 @@ class Markdown: break # Check if the next non-blank line is still a part of the list + if ( RE.regExp[listexpr].match(next) or RE.regExp['tabbed'].match(next) ): # get rid of any white space in the line @@ -1702,16 +1522,15 @@ class Markdown: else: i += 1 - # Add the dom elements + # Add the ElementTree elements for item in items: - li = self.doc.createElement("li") - ul.appendChild(li) + li = etree.SubElement(ul, "li") self._processSection(li, item, inList + 1, looseList = looseList) # Process the remaining part of the section - self._processSection(parent_elem, lines[i:], inList) + self._processSection(parentElem, lines[i:], inList) def _linesUntil(self, lines, condition): @@ -1725,12 +1544,13 @@ class Markdown: i = -1 for line in lines: i += 1 - if condition(line): break + if condition(line): + break else: i += 1 return lines[:i], lines[i:] - def _processQuote(self, parent_elem, lines, inList): + def _processQuote(self, parentElem, lines, inList): """ Given a list of document lines starting with a quote finds the end of the quote, unindents it and recursively @@ -1739,7 +1559,7 @@ class Markdown: Keyword arguments: - * parent_elem: DOM element to which the content will be added + * parentElem: ElementTree element to which the content will be added * lines: a list of lines * inList: a level @@ -1766,25 +1586,24 @@ class Markdown: else: break - blockquote = self.doc.createElement('blockquote') - parent_elem.appendChild(blockquote) + blockquote = etree.SubElement(parentElem, "blockquote") self._processSection(blockquote, dequoted, inList) - self._processSection(parent_elem, lines[i:], inList) + self._processSection(parentElem, lines[i:], inList) - def _processCodeBlock(self, parent_elem, lines, inList): + def _processCodeBlock(self, parentElem, lines, inList): """ Given a list of document lines starting with a code block - finds the end of the block, puts it into the dom verbatim + finds the end of the block, puts it into the ElementTree verbatim wrapped in ("<pre><code>") and recursively processes the the remainder of the text file. Keyword arguments: - * parent_elem: DOM element to which the content will be added + * parentElem: ElementTree element to which the content will be added * lines: a list of lines * inList: a level @@ -1794,163 +1613,264 @@ class Markdown: detabbed, theRest = self.blockGuru.detectTabbed(lines) - pre = self.doc.createElement('pre') - code = self.doc.createElement('code') - parent_elem.appendChild(pre) - pre.appendChild(code) + pre = etree.SubElement(parentElem, "pre") + code = etree.SubElement(pre, "code") + text = "\n".join(detabbed).rstrip()+"\n" - #text = text.replace("&", "&") - code.appendChild(self.doc.createTextNode(text)) - self._processSection(parent_elem, theRest, inList) - - - - def _handleInline (self, line, patternIndex=0): + code.text = text + self._processSection(parentElem, theRest, inList) + + def _handleInline(self, data, patternIndex=0): """ - Transform a Markdown line with inline elements to an XHTML - fragment. - - This function uses auxiliary objects called inline patterns. - See notes on inline patterns above. + Processinf string with inline patterns and replasing it + with placeholders Keyword arguments: - * line: A line of Markdown text + * data: A line of Markdown text * patternIndex: The index of the inlinePattern to start with - Return: A list of NanoDom nodes + Return: String with placeholders. """ - - - parts = [line] - - while patternIndex < len(self.inlinePatterns): - - i = 0 - - while i < len(parts): - - x = parts[i] - - if isinstance(x, (str, unicode)): - result = self._applyPattern(x, \ - self.inlinePatterns[patternIndex], \ - patternIndex) - - if result: - i -= 1 - parts.remove(x) - for y in result: - parts.insert(i+1,y) - - i += 1 - patternIndex += 1 - - for i in range(len(parts)): - x = parts[i] - if isinstance(x, (str, unicode)): - parts[i] = self.doc.createTextNode(x) - - return parts + startIndex = 0 - - def _applyPattern(self, line, pattern, patternIndex): - + while patternIndex < len(self.inlinePatterns): + + data, matched, startIndex = self._applyInline( + self.inlinePatterns[patternIndex], + data, patternIndex, startIndex) + if not matched: + patternIndex += 1 + return data + + def _applyInline(self, pattern, data, patternIndex, startIndex=0): """ Given a pattern name, this function checks if the line - fits the pattern, creates the necessary elements, and returns - back a list consisting of NanoDom elements and/or strings. + fits the pattern, creates the necessary elements, adds it + to InlineStash, and returns string with placeholders, + instead of ElementTree elements. Keyword arguments: - * line: the text to be processed + * data: the text to be processed * pattern: the pattern to be checked + * patternIndex: index of current pattern + * startIndex: string index, from wich we starting search - Returns: The appropriate newly created NanoDom element if the - pattern matches, None otherwise. + Returns: String with placeholders. """ + match = pattern.getCompiledRegExp().match(data[startIndex:]) + leftData = data[:startIndex] + + if not match: + return data, False, 0 - # match the line to pattern's pre-compiled reg exp. - # if no match, move on. - - - - m = pattern.getCompiledRegExp().match(line) - if not m: - return None - - # if we got a match let the pattern make us a NanoDom node - # if it doesn't, move on - node = pattern.handleMatch(m, self.doc) - - # check if any of this nodes have children that need processing - - if isinstance(node, Element): - - if not node.nodeName in ["code", "pre"]: - for child in node.childNodes: - if isinstance(child, TextNode): - - result = self._handleInline(child.value, patternIndex+1) - - if result: - - if result == [child]: - continue - - result.reverse() - #to make insertion easier + node = pattern.handleMatch(match) + + if node is None: + return data, True, len(leftData) + match.span(len(match.groups()))[0] + + if not isstr(node): + if not node.tag in ["code", "pre"]: + # We need to process current node too + for child in [node] + node.getchildren(): + if not isstr(node): + if child.text: + child.text = self._handleInline(child.text, + patternIndex + 1) + if child.tail: + child.tail = self._handleInline(child.tail, + patternIndex) + + pholder = self.inlineStash.add(node, pattern.type()) + + return "%s%s%s%s" % (leftData, + match.group(1), + pholder, match.groups()[-1]), True, 0 + + def _processElementText(self, node, subnode, isText=True): + + if isText: + text = subnode.text + subnode.text = None + else: + text = subnode.tail + subnode.tail = None + + childResult = self._processPlaceholders(text, subnode) + + if not isText and node is not subnode: + pos = node.getchildren().index(subnode) + node.remove(subnode) + else: + pos = 0 + + childResult.reverse() + for newChild in childResult: + node.insert(pos, newChild) + + def _processPlaceholders(self, data, parent): + """ + Processes string with placeholders and generates ElementTree tree. + + * data: string with placeholders instead of ElementTree elements. - position = node.childNodes.index(child) + Returns: list with ElementTree elements with applied inline patterns. + """ + + def linkText(text): + if text: + if result: + if result[-1].tail: + result[-1].tail += text + else: + result[-1].tail = text + else: + if parent.text: + parent.text += text + else: + parent.text = text + + result = [] + prefix = self.inlineStash.prefix + strartIndex = 0 + while data: + + index = data.find(prefix, strartIndex) + if index != -1: + + id, phEndIndex = self.inlineStash.extractId(data, index) + + if self.inlineStash.isin(id): + + node = self.inlineStash.get(id) + + if index > 0: + text = data[strartIndex:index] + linkText(text) + + if not isstr(node): # it's Element + + for child in [node] + node.getchildren(): + + if child.tail: + self._processElementText(node, child, False) - node.removeChild(child) - - for item in result: - - if isinstance(item, (str, unicode)): - if len(item) > 0: - node.insertChild(position, - self.doc.createTextNode(item)) - else: - node.insertChild(position, item) + if child.text: + self._processElementText(child, child) + + else: # it's just a string + linkText(node) + strartIndex = phEndIndex + continue + + strartIndex = phEndIndex + result.append(node) + + else: # wrong placeholder + end = index + len(prefix) + linkText(data[strartIndex:end]) + strartIndex = end + else: + + text = data[strartIndex:] + linkText(text) + data = "" + return result + + def _processTree(self, el): + """ + Processing ElementTree, and applying inline patterns + + Keyword arguments: + + * el - parent element of ElementTree. + Returns: ElementTree object with applied inline patterns. + """ - if node: - # Those are in the reverse order! - return ( m.groups()[-1], # the string to the left - node, # the new node - m.group(1)) # the string to the right of the match - - else: - return None + stack = [el] + while stack: + currElement = stack.pop() + insertQueue = [] + for child in currElement.getchildren(): + + if child.tag == "inline": + + lst = self._processPlaceholders(self._handleInline( + child.text), currElement) + + pos = currElement.getchildren().index(child) + + insertQueue.append((child, pos, lst)) + + else: + stack.append(child) + + + for element, pos, lst in insertQueue: + currElement.remove(element) + if currElement.text: + currElement.text = handleAttributes(currElement.text, + currElement) + for newChild in lst: + # Processing attributes + if newChild.tail: + newChild.tail = handleAttributes(newChild.tail, + currElement) + if newChild.text: + newChild.text = handleAttributes(newChild.text, + newChild) + currElement.insert(pos, newChild) + pos += 1 + + + def applyInlinePatterns(self, markdownTree): + """ + Retrun ElementTree, with applied + inline paterns + + Keyword arguments: + + * markdownTree: ElementTree object, reppresenting Markdown tree. - def convert (self, source=None): + Returns: ElementTree object. """ - Return the document in XHTML format. + + el = markdownTree.getroot() + + self._processTree(el) + + return markdownTree + + def markdownToTree(self, source=None): + """ + Retrun ElementTree, without applying inline paterns, + all data, that should be processed with + inline patterns included in <inline></inline> sections. + Keyword arguments: * source: An ascii or unicode string of Markdown formated text. - Returns: A serialized XHTML body. - + Returns: ElementTree object. """ - if source is not None: #Allow blank string self.source = source - + if not self.source: return u"" - + try: self.source = unicode(self.source) except UnicodeDecodeError: message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.') return u"" - + # Fixup the source text self.source = self.source.replace(START, "") @@ -1962,20 +1882,48 @@ class Markdown: for pp in self.textPreprocessors: self.source = pp.run(self.source) + + markdownTree = self._transform() + + return markdownTree + + - doc = self._transform() - xml = doc.toxml() + def convert (self, source=None): + """ + Return the document in XHTML format. + Keyword arguments: + + * source: An ascii or unicode string of Markdown formated text. - # Return everything but the top level tag + Returns: A serialized XHTML body. + """ + + tree = self.markdownToTree(source) + + root = self.applyInlinePatterns(tree).getroot() + + # Run the post-processors + for postprocessor in self.postprocessors: + postprocessor.stash = self.htmlStash + newRoot = postprocessor.run(root) + if newRoot: + root = newRoot + + indentETree(root) + + xml = codecs.decode(etree.tostring(root, encoding="utf8"), "utf8") + if self.stripTopLevelTags: - xml = xml.strip()[23:-7] + "\n" + xml = xml.strip()[44:-7] + "\n" + # Run the text post-processors for pp in self.textPostprocessors: xml = pp.run(xml) - return (self.docType + xml).strip() + return xml.strip() def __str__(self): |