From 53f95cde694975e556db7569d3bd89f84459e7bb Mon Sep 17 00:00:00 2001
From: Waylan Limberg <waylan@gmail.com>
Date: Sun, 19 Oct 2008 23:22:05 -0400
Subject: Made InlineProcessor a TreeProcessor. Now an extension can manipulate
 the tree either before or after InlinePatterns are run. Updated docs as well.
 This change should not affect existing extensions.

---
 markdown.py | 734 ++++++++++++++++++++++++++++++------------------------------
 1 file changed, 365 insertions(+), 369 deletions(-)

(limited to 'markdown.py')

diff --git a/markdown.py b/markdown.py
index ae179b4..437530c 100755
--- a/markdown.py
+++ b/markdown.py
@@ -576,277 +576,6 @@ class MarkdownParser:
         return items, lines[i:]
 
 
-"""
-INLINE PROCESSOR
-=============================================================================
-
-This class handles basic Markdown parsing.  It doesn't concern itself with
-inline elements such as **bold** or *italics*, but rather just catches blocks,
-lists, quotes, etc.
-"""
-
-class InlineProcessor:
-    """
-    An auxiliary class to traverse a Markdown tree, applying inline patterns.
-    """
-
-    def __init__ (self, patterns):
-        self.__inlinePatterns = patterns
-        self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX
-        self.__placeholder_suffix = ETX
-        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
-                                      + len(self.__placeholder_suffix)
-        self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})')
-
-    def __makePlaceholder(self, type):
-        """ Generate a placeholder """
-        id = "%04d" % len(self.stashed_nodes)
-        hash = INLINE_PLACEHOLDER % id
-        return hash, id
-
-    def __findPlaceholder(self, data, index):
-        """
-        Extract id from data string, start from index
-
-        Keyword arguments:
-
-        * data: string
-        * index: index, from which we start search
-
-        Returns: placeholder id and string index, after the found placeholder.
-        """
-
-        m = self.__placeholder_re.search(data, index)
-        if m:
-            return m.group(1), m.end()
-        else:
-            return None, index + 1
-
-    def __stashNode(self, node, type):
-        """ Add node to stash """
-        placeholder, id = self.__makePlaceholder(type)
-        self.stashed_nodes[id] = node
-        return placeholder
-
-    def __handleInline(self, data, patternIndex=0):
-        """
-        Process string with inline patterns and replace it
-        with placeholders
-
-        Keyword arguments:
-
-        * data: A line of Markdown text
-        * patternIndex: The index of the inlinePattern to start with
-
-        Returns: String with placeholders.
-
-        """
-        if not isinstance(data, AtomicString):
-            startIndex = 0
-            while patternIndex < len(self.__inlinePatterns):
-                data, matched, startIndex = self.__applyPattern(
-                                                 self.__inlinePatterns[patternIndex],
-                                                 data, patternIndex, startIndex)
-                if not matched:
-                    patternIndex += 1
-        return data
-
-    def __processElementText(self, node, subnode, isText=True):
-        """
-        Process placeholders in Element.text or Element.tail
-        of Elements popped from self.stashed_nodes.
-
-        Keywords arguments:
-
-        * node: parent node
-        * subnode: processing node
-        * isText: bool variable, True - it's text, False - it's tail
-
-        Returns: None
-
-        """
-        if isText:
-            text = subnode.text
-            subnode.text = None
-        else:
-            text = subnode.tail
-            subnode.tail = None
-
-        childResult = self.__processPlaceholders(text, subnode)
-
-        if not isText and node is not subnode:
-            pos = node.getchildren().index(subnode)
-            node.remove(subnode)
-        else:
-            pos = 0
-
-        childResult.reverse()
-        for newChild in childResult:
-            node.insert(pos, newChild)
-
-    def __processPlaceholders(self, data, parent):
-        """
-        Process string with placeholders and generate ElementTree tree.
-
-        Keyword arguments:
-
-        * data: string with placeholders instead of ElementTree elements.
-        * parent: Element, which contains processing inline data
-
-        Returns: list with ElementTree elements with applied inline patterns.
-        """
-        def linkText(text):
-            if text:
-                if result:
-                    if result[-1].tail:
-                        result[-1].tail += text
-                    else:
-                        result[-1].tail = text
-                else:
-                    if parent.text:
-                        parent.text += text
-                    else:
-                        parent.text = text
-
-        result = []
-        strartIndex = 0
-        while data:
-            index = data.find(self.__placeholder_prefix, strartIndex)
-            if index != -1:
-                id, phEndIndex = self.__findPlaceholder(data, index)
-
-                if self.stashed_nodes.has_key(id):
-                    node = self.stashed_nodes.get(id)
-
-                    if index > 0:
-                        text = data[strartIndex:index]
-                        linkText(text)
-
-                    if not isString(node): # it's Element
-                        for child in [node] + node.getchildren():
-                            if child.tail:
-                                if child.tail.strip():
-                                    self.__processElementText(node, child, False)
-                            if child.text:
-                                if child.text.strip():
-                                    self.__processElementText(child, child)
-                    else: # it's just a string
-                        linkText(node)
-                        strartIndex = phEndIndex
-                        continue
-
-                    strartIndex = phEndIndex
-                    result.append(node)
-
-                else: # wrong placeholder
-                    end = index + len(prefix)
-                    linkText(data[strartIndex:end])
-                    strartIndex = end
-            else:
-                text = data[strartIndex:]
-                linkText(text)
-                data = ""
-
-        return result
-
-    def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
-        """
-        Check if the line fits the pattern, create the necessary
-        elements, add it to stashed_nodes.
-
-        Keyword arguments:
-
-        * data: the text to be processed
-        * pattern: the pattern to be checked
-        * patternIndex: index of current pattern
-        * startIndex: string index, from which we starting search
-
-        Returns: String with placeholders instead of ElementTree elements.
-
-        """
-        match = pattern.getCompiledRegExp().match(data[startIndex:])
-        leftData = data[:startIndex]
-
-        if not match:
-            return data, False, 0
-
-        node = pattern.handleMatch(match)
-
-        if node is None:
-            return data, True, len(leftData) + match.span(len(match.groups()))[0]
-
-        if not isString(node):
-            if not isinstance(node.text, AtomicString):
-                # We need to process current node too
-                for child in [node] + node.getchildren():
-                    if not isString(node):
-                        if child.text:
-                            child.text = self.__handleInline(child.text,
-                                                            patternIndex + 1)
-                        if child.tail:
-                            child.tail = self.__handleInline(child.tail,
-                                                            patternIndex)
-
-        placeholder = self.__stashNode(node, pattern.type())
-
-        return "%s%s%s%s" % (leftData,
-                             match.group(1),
-                             placeholder, match.groups()[-1]), True, 0
-
-    def applyInlinePatterns(self, markdownTree):
-        """Apply inline patterns to a parsed Markdown tree.
-
-        Iterate over ElementTree, find elements with inline tag, apply inline
-        patterns and append newly created Elements to tree.  If you don't
-        want process your data with inline paterns, instead of normal string,
-        use subclass AtomicString:
-
-            node.text = AtomicString("data won't be processed with inline patterns")
-
-        Arguments:
-
-        * markdownTree: ElementTree object, representing Markdown tree.
-
-        Returns: ElementTree object with applied inline patterns.
-
-        """
-        self.stashed_nodes = {}
-
-        stack = [markdownTree.getroot()]
-
-        while stack:
-            currElement = stack.pop()
-            insertQueue = []
-            for child in currElement.getchildren():
-                if child.text and not isinstance(child.text, AtomicString):
-                    text = child.text
-                    child.text = None
-                    lst = self.__processPlaceholders(self.__handleInline(
-                                                    text), child)
-                    stack += lst
-                    insertQueue.append((child, lst))
-
-                if child.getchildren():
-                    stack.append(child)
-
-            for element, lst in insertQueue:
-                if element.text:
-                    element.text = handleAttributes(element.text, element)
-                i = 0
-                for newChild in lst:
-                    # Processing attributes
-                    if newChild.tail:
-                        newChild.tail = handleAttributes(newChild.tail,
-                                                         element)
-                    if newChild.text:
-                        newChild.text = handleAttributes(newChild.text,
-                                                         newChild)
-                    element.insert(i, newChild)
-                    i += 1
-
-        return markdownTree
-
-
 """
 PRE-PROCESSORS
 =============================================================================
@@ -1375,132 +1104,373 @@ class ImagePattern(LinkPattern):
         el.set('alt', truealt)
         return el
 
-class ReferencePattern(LinkPattern):
-    """ Match to a stored reference and return link element. """
-    def handleMatch(self, m):
-        if m.group(9):
-            id = m.group(9).lower()
+class ReferencePattern(LinkPattern):
+    """ Match to a stored reference and return link element. """
+    def handleMatch(self, m):
+        if m.group(9):
+            id = m.group(9).lower()
+        else:
+            # if we got something like "[Google][]"
+            # we'll use "google" as the id
+            id = m.group(2).lower()
+
+        if not self.markdown.references.has_key(id): # ignore undefined refs
+            return None
+        href, title = self.markdown.references[id]
+
+        text = m.group(2)
+        return self.makeTag(href, title, text)
+
+    def makeTag(self, href, title, text):
+        el = etree.Element('a')
+
+        el.set('href', self.sanitize_url(href))
+        if title:
+            el.set('title', title)
+
+        el.text = text
+        return el
+
+
+class ImageReferencePattern (ReferencePattern):
+    """ Match to a stored reference and return img element. """
+    def makeTag(self, href, title, text):
+        el = etree.Element("img")
+        el.set("src", self.sanitize_url(href))
+        if title:
+            el.set("title", title)
+        el.set("alt", text)
+        return el
+
+
+class AutolinkPattern (Pattern):
+    """ Return a link Element given an autolink (`<http://example/com>`). """
+    def handleMatch(self, m):
+        el = etree.Element("a")
+        el.set('href', m.group(2))
+        el.text = AtomicString(m.group(2))
+        return el
+
+class AutomailPattern (Pattern):
+    """
+    Return a mailto link Element given an automail link (`<foo@example.com>`).
+    """
+    def handleMatch(self, m):
+        el = etree.Element('a')
+        email = m.group(2)
+        if email.startswith("mailto:"):
+            email = email[len("mailto:"):]
+
+        def codepoint2name(code):
+            """Return entity definition by code, or the code if not defined."""
+            entity = htmlentitydefs.codepoint2name.get(code)
+            if entity:
+                return "%s%s;" % (AMP_SUBSTITUTE, entity)
+            else:
+                return "%s#%d;" % (AMP_SUBSTITUTE, code)
+
+        letters = [codepoint2name(ord(letter)) for letter in email]
+        el.text = AtomicString(''.join(letters))
+
+        mailto = "mailto:" + email
+        mailto = "".join([AMP_SUBSTITUTE + '#%d;' %
+                          ord(letter) for letter in mailto])
+        el.set('href', mailto)
+        return el
+
+
+"""
+POST-PROCESSORS
+=============================================================================
+
+Markdown also allows post-processors, which are similar to preprocessors in
+that they need to implement a "run" method. However, they are run after core
+processing.
+
+There are two types of post-processors: Treeprocessor and Postprocessor
+"""
+
+class Treeprocessor(Processor):
+    """
+    Treeprocessors are run on the ElementTree object before serialization.
+
+    Each Treeprocessor implements a "run" method that takes the root of the
+    tree and either modifies it in place or returns a replacement for it;
+    see the `run` docstring for the exact return convention.
+
+    Treeprocessors must extend markdown.Treeprocessor.
+
+    """
+    def run(self, root):
+        """
+        Subclasses of Treeprocessor should implement a `run` method, which
+        takes a root ElementTree. This method can return another ElementTree
+        object, and the existing root ElementTree will be replaced, or it can
+        modify the current tree and return None.
+        """
+        pass
+
+
+class InlineProcessor(Treeprocessor):
+    """
+    A Treeprocessor that traverses a tree, applying inline patterns.
+    """
+
+    def __init__ (self, md):  # NOTE(review): md is not used in this body
+        # self.patterns is assigned by Markdown after all extensions are loaded.
+        self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX
+        self.__placeholder_suffix = ETX
+        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
+                                      + len(self.__placeholder_suffix)
+        self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})')
+
+    def __makePlaceholder(self, type):
+        """ Return (placeholder, id) for the next stash slot; `type` is unused. """
+        id = "%04d" % len(self.stashed_nodes)
+        hash = INLINE_PLACEHOLDER % id
+        return hash, id
+
+    def __findPlaceholder(self, data, index):
+        """
+        Find the first placeholder in data at or after index.
+
+        Keyword arguments:
+
+        * data: string to search
+        * index: position in data at which the search starts
+
+        Returns: (id, end position of the match), or (None, index + 1) if none.
+        """
+
+        m = self.__placeholder_re.search(data, index)
+        if m:
+            return m.group(1), m.end()
+        else:
+            return None, index + 1
+
+    def __stashNode(self, node, type):
+        """ Store node in self.stashed_nodes and return its placeholder string. """
+        placeholder, id = self.__makePlaceholder(type)
+        self.stashed_nodes[id] = node
+        return placeholder
+
+    def __handleInline(self, data, patternIndex=0):
+        """
+        Run data through self.patterns from patternIndex on, swapping matched
+        markup for placeholders. AtomicString data is returned unchanged.
+
+        Keyword arguments:
+
+        * data: A line of Markdown text
+        * patternIndex: The index of the inlinePattern to start with
+
+        Returns: String with placeholders.
+
+        """
+        if not isinstance(data, AtomicString):
+            startIndex = 0
+            while patternIndex < len(self.patterns):
+                data, matched, startIndex = self.__applyPattern(
+                                                 self.patterns[patternIndex],
+                                                 data, patternIndex, startIndex)
+                if not matched:  # the same pattern is retried while it matches
+                    patternIndex += 1
+        return data
+
+    def __processElementText(self, node, subnode, isText=True):
+        """
+        Process placeholders in Element.text or Element.tail
+        of Elements popped from self.stashed_nodes.
+
+        Keywords arguments:
+
+        * node: parent node
+        * subnode: processing node
+        * isText: bool variable, True - it's text, False - it's tail
+
+        Returns: None
+
+        """
+        if isText:
+            text = subnode.text
+            subnode.text = None
+        else:
+            text = subnode.tail
+            subnode.tail = None
+
+        childResult = self.__processPlaceholders(text, subnode)
+
+        if not isText and node is not subnode:
+            pos = node.getchildren().index(subnode)
+            node.remove(subnode)
         else:
-            # if we got something like "[Google][]"
-            # we'll use "google" as the id
-            id = m.group(2).lower()
+            pos = 0
 
-        if not self.markdown.references.has_key(id): # ignore undefined refs
-            return None
-        href, title = self.markdown.references[id]
+        childResult.reverse()
+        for newChild in childResult:
+            node.insert(pos, newChild)
 
-        text = m.group(2)
-        return self.makeTag(href, title, text)
+    def __processPlaceholders(self, data, parent):
+        """
+        Process string with placeholders and generate ElementTree tree.
 
-    def makeTag(self, href, title, text):
-        el = etree.Element('a')
+        Keyword arguments:
 
-        el.set('href', self.sanitize_url(href))
-        if title:
-            el.set('title', title)
+        * data: string with placeholders instead of ElementTree elements.
+        * parent: Element, which contains processing inline data
 
-        el.text = text
-        return el
+        Returns: list with ElementTree elements with applied inline patterns.
+        """
+        def linkText(text):
+            if text:
+                if result:
+                    if result[-1].tail:
+                        result[-1].tail += text
+                    else:
+                        result[-1].tail = text
+                else:
+                    if parent.text:
+                        parent.text += text
+                    else:
+                        parent.text = text
 
+        result = []
+        strartIndex = 0
+        while data:
+            index = data.find(self.__placeholder_prefix, strartIndex)
+            if index != -1:
+                id, phEndIndex = self.__findPlaceholder(data, index)
 
-class ImageReferencePattern (ReferencePattern):
-    """ Match to a stored reference and return img element. """
-    def makeTag(self, href, title, text):
-        el = etree.Element("img")
-        el.set("src", self.sanitize_url(href))
-        if title:
-            el.set("title", title)
-        el.set("alt", text)
-        return el
+                if self.stashed_nodes.has_key(id):
+                    node = self.stashed_nodes.get(id)
 
+                    if index > 0:
+                        text = data[strartIndex:index]
+                        linkText(text)
 
-class AutolinkPattern (Pattern):
-    """ Return a link Element given an autolink (`<http://example/com>`). """
-    def handleMatch(self, m):
-        el = etree.Element("a")
-        el.set('href', m.group(2))
-        el.text = AtomicString(m.group(2))
-        return el
+                    if not isString(node): # it's Element
+                        for child in [node] + node.getchildren():
+                            if child.tail:
+                                if child.tail.strip():
+                                    self.__processElementText(node, child, False)
+                            if child.text:
+                                if child.text.strip():
+                                    self.__processElementText(child, child)
+                    else: # it's just a string
+                        linkText(node)
+                        strartIndex = phEndIndex
+                        continue
 
-class AutomailPattern (Pattern):
-    """
-    Return a mailto link Element given an automail link (`<foo@example.com>`).
-    """
-    def handleMatch(self, m):
-        el = etree.Element('a')
-        email = m.group(2)
-        if email.startswith("mailto:"):
-            email = email[len("mailto:"):]
+                    strartIndex = phEndIndex
+                    result.append(node)
 
-        def codepoint2name(code):
-            """Return entity definition by code, or the code if not defined."""
-            entity = htmlentitydefs.codepoint2name.get(code)
-            if entity:
-                return "%s%s;" % (AMP_SUBSTITUTE, entity)
+                else: # unknown id: emit text up to and including the prefix
+                    end = index + len(self.__placeholder_prefix)
+                    linkText(data[strartIndex:end])
+                    strartIndex = end
             else:
-                return "%s#%d;" % (AMP_SUBSTITUTE, code)
+                text = data[strartIndex:]
+                linkText(text)
+                data = ""
 
-        letters = [codepoint2name(ord(letter)) for letter in email]
-        el.text = AtomicString(''.join(letters))
+        return result
 
-        mailto = "mailto:" + email
-        mailto = "".join([AMP_SUBSTITUTE + '#%d;' %
-                          ord(letter) for letter in mailto])
-        el.set('href', mailto)
-        return el
+    def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
+        """
+        Check if the line fits the pattern, create the necessary
+        elements, add it to stashed_nodes.
 
+        Keyword arguments:
 
-"""
-POST-PROCESSORS
-=============================================================================
+        * data: the text to be processed
+        * pattern: the pattern to be checked
+        * patternIndex: index of current pattern
+        * startIndex: string index, from which we starting search
 
-Markdown also allows post-processors, which are similar to preprocessors in
-that they need to implement a "run" method. However, they are run after core
-processing.
+        Returns: String with placeholders instead of ElementTree elements.
 
-There are two types of post-processors: Treeprocessor and Postprocessor
-"""
+        """
+        match = pattern.getCompiledRegExp().match(data[startIndex:])
+        leftData = data[:startIndex]
 
-class Treeprocessor(Processor):
-    """
-    Treeprocessors are run on the ElementTree object before serialization.
+        if not match:
+            return data, False, 0
 
-    Each Treeprocessor implements a "run" method that takes a pointer to an
-    ElementTree, modifies it as necessary and returns an ElementTree
-    object.
+        node = pattern.handleMatch(match)
 
-    Treeprocessors must extend markdown.Treeprocessor.
+        if node is None:
+            return data, True, len(leftData) + match.span(len(match.groups()))[0]
 
-    """
-    def run(self, root):
-        """
-        Subclasses of Treeprocessor should implement a `run` method, which
-        takes a root ElementTree. This method can return another ElementTree 
-        object, and the existing root ElementTree will be replaced, or it can 
-        modify the current tree and return None.
-        """
-        pass
+        if not isString(node):
+            if not isinstance(node.text, AtomicString):
+                # We need to process current node too
+                for child in [node] + node.getchildren():
+                    if not isString(node):  # NOTE(review): tests node, not child
+                        if child.text:
+                            child.text = self.__handleInline(child.text,
+                                                            patternIndex + 1)
+                        if child.tail:
+                            child.tail = self.__handleInline(child.tail,
+                                                            patternIndex)
 
+        placeholder = self.__stashNode(node, pattern.type())
 
-class Postprocessor(Processor):
-    """
-    Postprocessors are run after the ElementTree it converted back into text.
+        return "%s%s%s%s" % (leftData,
+                             match.group(1),
+                             placeholder, match.groups()[-1]), True, 0
 
-    Each Postprocessor implements a "run" method that takes a pointer to a
-    text string, modifies it as necessary and returns a text string.
+    def run(self, tree):
+        """Apply inline patterns to a parsed Markdown tree.
 
-    Postprocessors must extend markdown.Postprocessor.
+        Walk the tree, apply the inline patterns to the text of each element
+        and insert the newly created Elements into it.  If you don't want
+        your data processed with inline patterns, use the AtomicString
+        subclass instead of a normal string:
 
-    """
+            node.text = AtomicString("data won't be processed with inline patterns")
 
-    def run(self, text):
-        """
-        Subclasses of Postprocessor should implement a `run` method, which
-        takes the html document as a single text string and returns a
-        (possibly modified) string.
+        Arguments:
+
+        * tree: root Element of the parsed Markdown tree.
+
+        Returns: the same Element, with inline patterns applied.
 
         """
-        pass
+        self.stashed_nodes = {}
+
+        stack = [tree]
+
+        while stack:
+            currElement = stack.pop()
+            insertQueue = []
+            for child in currElement.getchildren():
+                if child.text and not isinstance(child.text, AtomicString):
+                    text = child.text
+                    child.text = None
+                    lst = self.__processPlaceholders(self.__handleInline(
+                                                    text), child)
+                    stack += lst
+                    insertQueue.append((child, lst))
+
+                if child.getchildren():
+                    stack.append(child)
+
+            for element, lst in insertQueue:
+                if element.text:
+                    element.text = handleAttributes(element.text, element)
+                i = 0
+                for newChild in lst:
+                    # Processing attributes
+                    if newChild.tail:
+                        newChild.tail = handleAttributes(newChild.tail,
+                                                         element)
+                    if newChild.text:
+                        newChild.text = handleAttributes(newChild.text,
+                                                         newChild)
+                    element.insert(i, newChild)
+                    i += 1
+
+        return tree
 
 
 class PrettifyTreeprocessor(Treeprocessor):
@@ -1533,6 +1503,28 @@ class PrettifyTreeprocessor(Treeprocessor):
                 br.tail = '\n%s' % br.tail
 
 
+class Postprocessor(Processor):
+    """
+    Postprocessors are run after the ElementTree is converted back into text.
+
+    Each Postprocessor implements a "run" method that takes a text string,
+    modifies it as necessary and returns a text string.
+
+    Postprocessors must extend markdown.Postprocessor.
+
+    """
+
+    def run(self, text):
+        """
+        Subclasses of Postprocessor should implement a `run` method, which
+        takes the html document as a single text string and returns a
+        (possibly modified) string.
+
+        """
+        pass
+
+
+
 class RawHtmlPostprocessor(Postprocessor):
     """ Restore raw html to the document. """
 
@@ -1801,6 +1793,7 @@ class Markdown:
         # footnote preprocessor will be inserted with "<reference"
 
         self.treeprocessors = Treap()
+        self.treeprocessors.add("inline", InlineProcessor(self))
         self.treeprocessors.add("prettify", PrettifyTreeprocessor(self))
 
         self.postprocessors = Treap()
@@ -1841,6 +1834,9 @@ class Markdown:
                                 configs = extension_configs)
         self.reset()
 
+        # Sort and add patterns only after all extensions are loaded.
+        self.treeprocessors['inline'].patterns = self.inlinePatterns.heapsorted()
+
     def registerExtensions(self, extensions, configs):
         """
         Register extensions with this instance of Markdown.
@@ -1915,11 +1911,11 @@ class Markdown:
             self.lines = prep.run(self.lines)
 
         # Parse the high-level elements.
-        tree = self.parser.parseDocument(self.lines)
+        root = self.parser.parseDocument(self.lines).getroot()
 
         # Apply inline patterns
-        inlineProcessor = InlineProcessor(self.inlinePatterns.heapsorted())
-        root = inlineProcessor.applyInlinePatterns(tree).getroot()
+        #inlineProcessor = InlineProcessor(self.inlinePatterns.heapsorted())
+        #root = inlineProcessor.applyInlinePatterns(tree).getroot()
 
         # Run the tree-processors
         for treeprocessor in self.treeprocessors.heapsorted():
-- 
cgit v1.2.3