From 53f95cde694975e556db7569d3bd89f84459e7bb Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Sun, 19 Oct 2008 23:22:05 -0400 Subject: Made InlineProcessor a TreeProcessor. Now an extension can manipulate the tree either before or after InlinePatterns are run. Updated docs as well. This change should not affect existing extensions. --- markdown.py | 734 ++++++++++++++++++++++++++++++------------------------------ 1 file changed, 365 insertions(+), 369 deletions(-) (limited to 'markdown.py') diff --git a/markdown.py b/markdown.py index ae179b4..437530c 100755 --- a/markdown.py +++ b/markdown.py @@ -576,277 +576,6 @@ class MarkdownParser: return items, lines[i:] -""" -INLINE PROCESSOR -============================================================================= - -This class handles basic Markdown parsing. It doesn't concern itself with -inline elements such as **bold** or *italics*, but rather just catches blocks, -lists, quotes, etc. -""" - -class InlineProcessor: - """ - An auxiliary class to traverse a Markdown tree, applying inline patterns. - """ - - def __init__ (self, patterns): - self.__inlinePatterns = patterns - self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX - self.__placeholder_suffix = ETX - self.__placeholder_length = 4 + len(self.__placeholder_prefix) \ - + len(self.__placeholder_suffix) - self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})') - - def __makePlaceholder(self, type): - """ Generate a placeholder """ - id = "%04d" % len(self.stashed_nodes) - hash = INLINE_PLACEHOLDER % id - return hash, id - - def __findPlaceholder(self, data, index): - """ - Extract id from data string, start from index - - Keyword arguments: - - * data: string - * index: index, from which we start search - - Returns: placeholder id and string index, after the found placeholder. - """ - - m = self.__placeholder_re.search(data, index) - if m: - return m.group(1), m.end() - else: - return None, index + 1 - - def __stashNode(self, node, type): - """ Add node to stash """ - placeholder, id = self.__makePlaceholder(type) - self.stashed_nodes[id] = node - return placeholder - - def __handleInline(self, data, patternIndex=0): - """ - Process string with inline patterns and replace it - with placeholders - - Keyword arguments: - - * data: A line of Markdown text - * patternIndex: The index of the inlinePattern to start with - - Returns: String with placeholders. - - """ - if not isinstance(data, AtomicString): - startIndex = 0 - while patternIndex < len(self.__inlinePatterns): - data, matched, startIndex = self.__applyPattern( - self.__inlinePatterns[patternIndex], - data, patternIndex, startIndex) - if not matched: - patternIndex += 1 - return data - - def __processElementText(self, node, subnode, isText=True): - """ - Process placeholders in Element.text or Element.tail - of Elements popped from self.stashed_nodes. - - Keywords arguments: - - * node: parent node - * subnode: processing node - * isText: bool variable, True - it's text, False - it's tail - - Returns: None - - """ - if isText: - text = subnode.text - subnode.text = None - else: - text = subnode.tail - subnode.tail = None - - childResult = self.__processPlaceholders(text, subnode) - - if not isText and node is not subnode: - pos = node.getchildren().index(subnode) - node.remove(subnode) - else: - pos = 0 - - childResult.reverse() - for newChild in childResult: - node.insert(pos, newChild) - - def __processPlaceholders(self, data, parent): - """ - Process string with placeholders and generate ElementTree tree. - - Keyword arguments: - - * data: string with placeholders instead of ElementTree elements. - * parent: Element, which contains processing inline data - - Returns: list with ElementTree elements with applied inline patterns. - """ - def linkText(text): - if text: - if result: - if result[-1].tail: - result[-1].tail += text - else: - result[-1].tail = text - else: - if parent.text: - parent.text += text - else: - parent.text = text - - result = [] - strartIndex = 0 - while data: - index = data.find(self.__placeholder_prefix, strartIndex) - if index != -1: - id, phEndIndex = self.__findPlaceholder(data, index) - - if self.stashed_nodes.has_key(id): - node = self.stashed_nodes.get(id) - - if index > 0: - text = data[strartIndex:index] - linkText(text) - - if not isString(node): # it's Element - for child in [node] + node.getchildren(): - if child.tail: - if child.tail.strip(): - self.__processElementText(node, child, False) - if child.text: - if child.text.strip(): - self.__processElementText(child, child) - else: # it's just a string - linkText(node) - strartIndex = phEndIndex - continue - - strartIndex = phEndIndex - result.append(node) - - else: # wrong placeholder - end = index + len(prefix) - linkText(data[strartIndex:end]) - strartIndex = end - else: - text = data[strartIndex:] - linkText(text) - data = "" - - return result - - def __applyPattern(self, pattern, data, patternIndex, startIndex=0): - """ - Check if the line fits the pattern, create the necessary - elements, add it to stashed_nodes. - - Keyword arguments: - - * data: the text to be processed - * pattern: the pattern to be checked - * patternIndex: index of current pattern - * startIndex: string index, from which we starting search - - Returns: String with placeholders instead of ElementTree elements. - - """ - match = pattern.getCompiledRegExp().match(data[startIndex:]) - leftData = data[:startIndex] - - if not match: - return data, False, 0 - - node = pattern.handleMatch(match) - - if node is None: - return data, True, len(leftData) + match.span(len(match.groups()))[0] - - if not isString(node): - if not isinstance(node.text, AtomicString): - # We need to process current node too - for child in [node] + node.getchildren(): - if not isString(node): - if child.text: - child.text = self.__handleInline(child.text, - patternIndex + 1) - if child.tail: - child.tail = self.__handleInline(child.tail, - patternIndex) - - placeholder = self.__stashNode(node, pattern.type()) - - return "%s%s%s%s" % (leftData, - match.group(1), - placeholder, match.groups()[-1]), True, 0 - - def applyInlinePatterns(self, markdownTree): - """Apply inline patterns to a parsed Markdown tree. - - Iterate over ElementTree, find elements with inline tag, apply inline - patterns and append newly created Elements to tree. If you don't - want process your data with inline paterns, instead of normal string, - use subclass AtomicString: - - node.text = AtomicString("data won't be processed with inline patterns") - - Arguments: - - * markdownTree: ElementTree object, representing Markdown tree. - - Returns: ElementTree object with applied inline patterns. - - """ - self.stashed_nodes = {} - - stack = [markdownTree.getroot()] - - while stack: - currElement = stack.pop() - insertQueue = [] - for child in currElement.getchildren(): - if child.text and not isinstance(child.text, AtomicString): - text = child.text - child.text = None - lst = self.__processPlaceholders(self.__handleInline( - text), child) - stack += lst - insertQueue.append((child, lst)) - - if child.getchildren(): - stack.append(child) - - for element, lst in insertQueue: - if element.text: - element.text = handleAttributes(element.text, element) - i = 0 - for newChild in lst: - # Processing attributes - if newChild.tail: - newChild.tail = handleAttributes(newChild.tail, - element) - if newChild.text: - newChild.text = handleAttributes(newChild.text, - newChild) - element.insert(i, newChild) - i += 1 - - return markdownTree - - """ PRE-PROCESSORS ============================================================================= @@ -1375,132 +1104,373 @@ class ImagePattern(LinkPattern): el.set('alt', truealt) return el -class ReferencePattern(LinkPattern): - """ Match to a stored reference and return link element. """ - def handleMatch(self, m): - if m.group(9): - id = m.group(9).lower() +class ReferencePattern(LinkPattern): + """ Match to a stored reference and return link element. """ + def handleMatch(self, m): + if m.group(9): + id = m.group(9).lower() + else: + # if we got something like "[Google][]" + # we'll use "google" as the id + id = m.group(2).lower() + + if not self.markdown.references.has_key(id): # ignore undefined refs + return None + href, title = self.markdown.references[id] + + text = m.group(2) + return self.makeTag(href, title, text) + + def makeTag(self, href, title, text): + el = etree.Element('a') + + el.set('href', self.sanitize_url(href)) + if title: + el.set('title', title) + + el.text = text + return el + + +class ImageReferencePattern (ReferencePattern): + """ Match to a stored reference and return img element. """ + def makeTag(self, href, title, text): + el = etree.Element("img") + el.set("src", self.sanitize_url(href)) + if title: + el.set("title", title) + el.set("alt", text) + return el + + +class AutolinkPattern (Pattern): + """ Return a link Element given an autolink (``). """ + def handleMatch(self, m): + el = etree.Element("a") + el.set('href', m.group(2)) + el.text = AtomicString(m.group(2)) + return el + +class AutomailPattern (Pattern): + """ + Return a mailto link Element given an automail link (``). + """ + def handleMatch(self, m): + el = etree.Element('a') + email = m.group(2) + if email.startswith("mailto:"): + email = email[len("mailto:"):] + + def codepoint2name(code): + """Return entity definition by code, or the code if not defined.""" + entity = htmlentitydefs.codepoint2name.get(code) + if entity: + return "%s%s;" % (AMP_SUBSTITUTE, entity) + else: + return "%s#%d;" % (AMP_SUBSTITUTE, code) + + letters = [codepoint2name(ord(letter)) for letter in email] + el.text = AtomicString(''.join(letters)) + + mailto = "mailto:" + email + mailto = "".join([AMP_SUBSTITUTE + '#%d;' % + ord(letter) for letter in mailto]) + el.set('href', mailto) + return el + + +""" +POST-PROCESSORS +============================================================================= + +Markdown also allows post-processors, which are similar to preprocessors in +that they need to implement a "run" method. However, they are run after core +processing. + +There are two types of post-processors: Treeprocessor and Postprocessor +""" + +class Treeprocessor(Processor): + """ + Treeprocessors are run on the ElementTree object before serialization. + + Each Treeprocessor implements a "run" method that takes a pointer to an + ElementTree, modifies it as necessary and returns an ElementTree + object. + + Treeprocessors must extend markdown.Treeprocessor. + + """ + def run(self, root): + """ + Subclasses of Treeprocessor should implement a `run` method, which + takes a root ElementTree. This method can return another ElementTree + object, and the existing root ElementTree will be replaced, or it can + modify the current tree and return None. + """ + pass + + +class InlineProcessor(Treeprocessor): + """ + A Treeprocessor that traverses a tree, applying inline patterns. + """ + + def __init__ (self, md): + #self.__inlinePatterns = patterns + self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX + self.__placeholder_suffix = ETX + self.__placeholder_length = 4 + len(self.__placeholder_prefix) \ + + len(self.__placeholder_suffix) + self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})') + + def __makePlaceholder(self, type): + """ Generate a placeholder """ + id = "%04d" % len(self.stashed_nodes) + hash = INLINE_PLACEHOLDER % id + return hash, id + + def __findPlaceholder(self, data, index): + """ + Extract id from data string, start from index + + Keyword arguments: + + * data: string + * index: index, from which we start search + + Returns: placeholder id and string index, after the found placeholder. + """ + + m = self.__placeholder_re.search(data, index) + if m: + return m.group(1), m.end() + else: + return None, index + 1 + + def __stashNode(self, node, type): + """ Add node to stash """ + placeholder, id = self.__makePlaceholder(type) + self.stashed_nodes[id] = node + return placeholder + + def __handleInline(self, data, patternIndex=0): + """ + Process string with inline patterns and replace it + with placeholders + + Keyword arguments: + + * data: A line of Markdown text + * patternIndex: The index of the inlinePattern to start with + + Returns: String with placeholders. + + """ + if not isinstance(data, AtomicString): + startIndex = 0 + while patternIndex < len(self.patterns): + data, matched, startIndex = self.__applyPattern( + self.patterns[patternIndex], + data, patternIndex, startIndex) + if not matched: + patternIndex += 1 + return data + + def __processElementText(self, node, subnode, isText=True): + """ + Process placeholders in Element.text or Element.tail + of Elements popped from self.stashed_nodes. + + Keywords arguments: + + * node: parent node + * subnode: processing node + * isText: bool variable, True - it's text, False - it's tail + + Returns: None + + """ + if isText: + text = subnode.text + subnode.text = None + else: + text = subnode.tail + subnode.tail = None + + childResult = self.__processPlaceholders(text, subnode) + + if not isText and node is not subnode: + pos = node.getchildren().index(subnode) + node.remove(subnode) else: - # if we got something like "[Google][]" - # we'll use "google" as the id - id = m.group(2).lower() + pos = 0 - if not self.markdown.references.has_key(id): # ignore undefined refs - return None - href, title = self.markdown.references[id] + childResult.reverse() + for newChild in childResult: + node.insert(pos, newChild) - text = m.group(2) - return self.makeTag(href, title, text) + def __processPlaceholders(self, data, parent): + """ + Process string with placeholders and generate ElementTree tree. - def makeTag(self, href, title, text): - el = etree.Element('a') + Keyword arguments: - el.set('href', self.sanitize_url(href)) - if title: - el.set('title', title) + * data: string with placeholders instead of ElementTree elements. + * parent: Element, which contains processing inline data - el.text = text - return el + Returns: list with ElementTree elements with applied inline patterns. + """ + def linkText(text): + if text: + if result: + if result[-1].tail: + result[-1].tail += text + else: + result[-1].tail = text + else: + if parent.text: + parent.text += text + else: + parent.text = text + result = [] + strartIndex = 0 + while data: + index = data.find(self.__placeholder_prefix, strartIndex) + if index != -1: + id, phEndIndex = self.__findPlaceholder(data, index) -class ImageReferencePattern (ReferencePattern): - """ Match to a stored reference and return img element. """ - def makeTag(self, href, title, text): - el = etree.Element("img") - el.set("src", self.sanitize_url(href)) - if title: - el.set("title", title) - el.set("alt", text) - return el + if self.stashed_nodes.has_key(id): + node = self.stashed_nodes.get(id) + if index > 0: + text = data[strartIndex:index] + linkText(text) -class AutolinkPattern (Pattern): - """ Return a link Element given an autolink (``). """ - def handleMatch(self, m): - el = etree.Element("a") - el.set('href', m.group(2)) - el.text = AtomicString(m.group(2)) - return el + if not isString(node): # it's Element + for child in [node] + node.getchildren(): + if child.tail: + if child.tail.strip(): + self.__processElementText(node, child, False) + if child.text: + if child.text.strip(): + self.__processElementText(child, child) + else: # it's just a string + linkText(node) + strartIndex = phEndIndex + continue -class AutomailPattern (Pattern): - """ - Return a mailto link Element given an automail link (``). - """ - def handleMatch(self, m): - el = etree.Element('a') - email = m.group(2) - if email.startswith("mailto:"): - email = email[len("mailto:"):] + strartIndex = phEndIndex + result.append(node) - def codepoint2name(code): - """Return entity definition by code, or the code if not defined.""" - entity = htmlentitydefs.codepoint2name.get(code) - if entity: - return "%s%s;" % (AMP_SUBSTITUTE, entity) + else: # wrong placeholder + end = index + len(prefix) + linkText(data[strartIndex:end]) + strartIndex = end else: - return "%s#%d;" % (AMP_SUBSTITUTE, code) + text = data[strartIndex:] + linkText(text) + data = "" - letters = [codepoint2name(ord(letter)) for letter in email] - el.text = AtomicString(''.join(letters)) + return result - mailto = "mailto:" + email - mailto = "".join([AMP_SUBSTITUTE + '#%d;' % - ord(letter) for letter in mailto]) - el.set('href', mailto) - return el + def __applyPattern(self, pattern, data, patternIndex, startIndex=0): + """ + Check if the line fits the pattern, create the necessary + elements, add it to stashed_nodes. + Keyword arguments: -""" -POST-PROCESSORS -============================================================================= + * data: the text to be processed + * pattern: the pattern to be checked + * patternIndex: index of current pattern + * startIndex: string index, from which we starting search -Markdown also allows post-processors, which are similar to preprocessors in -that they need to implement a "run" method. However, they are run after core -processing. + Returns: String with placeholders instead of ElementTree elements. -There are two types of post-processors: Treeprocessor and Postprocessor -""" + """ + match = pattern.getCompiledRegExp().match(data[startIndex:]) + leftData = data[:startIndex] -class Treeprocessor(Processor): - """ - Treeprocessors are run on the ElementTree object before serialization. + if not match: + return data, False, 0 - Each Treeprocessor implements a "run" method that takes a pointer to an - ElementTree, modifies it as necessary and returns an ElementTree - object. + node = pattern.handleMatch(match) - Treeprocessors must extend markdown.Treeprocessor. + if node is None: + return data, True, len(leftData) + match.span(len(match.groups()))[0] - """ - def run(self, root): - """ - Subclasses of Treeprocessor should implement a `run` method, which - takes a root ElementTree. This method can return another ElementTree - object, and the existing root ElementTree will be replaced, or it can - modify the current tree and return None. - """ - pass + if not isString(node): + if not isinstance(node.text, AtomicString): + # We need to process current node too + for child in [node] + node.getchildren(): + if not isString(node): + if child.text: + child.text = self.__handleInline(child.text, + patternIndex + 1) + if child.tail: + child.tail = self.__handleInline(child.tail, + patternIndex) + placeholder = self.__stashNode(node, pattern.type()) -class Postprocessor(Processor): - """ - Postprocessors are run after the ElementTree it converted back into text. + return "%s%s%s%s" % (leftData, + match.group(1), + placeholder, match.groups()[-1]), True, 0 - Each Postprocessor implements a "run" method that takes a pointer to a - text string, modifies it as necessary and returns a text string. + def run(self, tree): + """Apply inline patterns to a parsed Markdown tree. - Postprocessors must extend markdown.Postprocessor. + Iterate over ElementTree, find elements with inline tag, apply inline + patterns and append newly created Elements to tree. If you don't + want process your data with inline paterns, instead of normal string, + use subclass AtomicString: - """ + node.text = AtomicString("data won't be processed with inline patterns") - def run(self, text): - """ - Subclasses of Postprocessor should implement a `run` method, which - takes the html document as a single text string and returns a - (possibly modified) string. + Arguments: + + * markdownTree: ElementTree object, representing Markdown tree. + + Returns: ElementTree object with applied inline patterns. """ - pass + self.stashed_nodes = {} + + stack = [tree] + + while stack: + currElement = stack.pop() + insertQueue = [] + for child in currElement.getchildren(): + if child.text and not isinstance(child.text, AtomicString): + text = child.text + child.text = None + lst = self.__processPlaceholders(self.__handleInline( + text), child) + stack += lst + insertQueue.append((child, lst)) + + if child.getchildren(): + stack.append(child) + + for element, lst in insertQueue: + if element.text: + element.text = handleAttributes(element.text, element) + i = 0 + for newChild in lst: + # Processing attributes + if newChild.tail: + newChild.tail = handleAttributes(newChild.tail, + element) + if newChild.text: + newChild.text = handleAttributes(newChild.text, + newChild) + element.insert(i, newChild) + i += 1 + + return tree class PrettifyTreeprocessor(Treeprocessor): @@ -1533,6 +1503,28 @@ class PrettifyTreeprocessor(Treeprocessor): br.tail = '\n%s' % br.tail +class Postprocessor(Processor): + """ + Postprocessors are run after the ElementTree it converted back into text. + + Each Postprocessor implements a "run" method that takes a pointer to a + text string, modifies it as necessary and returns a text string. + + Postprocessors must extend markdown.Postprocessor. + + """ + + def run(self, text): + """ + Subclasses of Postprocessor should implement a `run` method, which + takes the html document as a single text string and returns a + (possibly modified) string. + + """ + pass + + + class RawHtmlPostprocessor(Postprocessor): """ Restore raw html to the document. """ @@ -1801,6 +1793,7 @@ class Markdown: # footnote preprocessor will be inserted with "