From 8b6b7b0a39321dadfcab4d0a16053377c4715bee Mon Sep 17 00:00:00 2001
From: Yuri Takhteyev <yuri@freewisdom.org>
Date: Sun, 12 Oct 2008 19:37:20 -0700
Subject: Refactored markdown tree traversing logic into a separate class
 (InlineProcessor).

---
 markdown.py | 391 +++++++++++++++++++++++++++++-------------------------------
 1 file changed, 189 insertions(+), 202 deletions(-)

(limited to 'markdown.py')

diff --git a/markdown.py b/markdown.py
index 0aa530d..ae8dc10 100755
--- a/markdown.py
+++ b/markdown.py
@@ -1401,28 +1401,32 @@ class HtmlStash:
         self.html_counter += 1
         return placeholder
     
-    def rest(self):
+    def reset(self):
         self.html_counter = 0
         self.rawHtmlBlocks = []
 
-    
-class InlineStash:
-    
-    def __init__(self):
-        """ Create a InlineStash. """
-        self.prefix = INLINE_PLACEHOLDER_PREFIX
-        self.suffix = ETX
-        self._nodes = {}
-        self.phLength = 4 + len(self.prefix) + len(self.suffix)
-        self._placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})')
+
+class InlineProcessor:
+    """
+    An auxiliary class to traverse a Markdown tree, applying inline patterns.
+    """
+
+    def __init__ (self, patterns):
+        self.inlinePatterns = patterns
+
+        self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX
+        self.__placeholder_suffix = ETX
+        self.__placeholder_length = 4 + len(self.__placeholder_prefix) \
+                                      + len(self.__placeholder_suffix)
+        self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})')
         
-    def _genPlaceholder(self, type):
+    def __makePlaceholder(self, type):
         """ Generate a placeholder """
-        id = "%04d" % len(self._nodes)
+        id = "%04d" % len(self.stashed_nodes)
         hash = INLINE_PLACEHOLDER % id 
         return hash, id
     
-    def extractId(self, data, index):
+    def __findPlaceholder(self, data, index):
         """ 
         Extract id from data string, start from index
         
@@ -1434,164 +1438,19 @@ class InlineStash:
         Returns: placeholder id and  string index, after 
         found placeholder
         """
-        m = self._placeholder_re.search(data, index)
+        m = self.__placeholder_re.search(data, index)
         if m:
             return m.group(1), m.end()
         else:
             return None, index + 1 
     
-    def isin(self, id):
-        """ Check if node with given id exists in stash """
-        return self._nodes.has_key(id)
-    
-    def get(self, id):
-        """ Return node by id """
-        return self._nodes.get(id)
-    
-    def add(self, node, type):
+    def __stashNode(self, node, type):
         """ Add node to stash """
-        pholder, id = self._genPlaceholder(type)
-        self._nodes[id] = node
-        return pholder
-    
-    def rest(self):
-        """ Reset instance """
-        self._nodes = {}
+        placeholder, id = self.__makePlaceholder(type)
+        self.stashed_nodes[id] = node
+        return placeholder
     
-           
-
-class Markdown:
-    """Convert Markdown to HTML."""
-
-    def __init__(self, 
-                 extensions=[],
-                 extension_configs={},
-                 safe_mode = False):
-        """
-        Creates a new Markdown instance.
-
-        Keyword arguments:
-        
-        * extensions: A list of extensions.  
-           If they are of type string, the module mdx_name.py will be loaded.  
-           If they are a subclass of markdown.Extension, they will be used 
-           as-is.
-        * extension-configs: Configuration setting for extensions.
-        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
-        
-        """
-        self.parser = MarkdownParser()
-        self.safeMode = safe_mode
-        self.registeredExtensions = []
-        self.docType = ""
-        self.stripTopLevelTags = True
-
-        self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
-
-        self.preprocessors = [HEADER_PREPROCESSOR,
-                              LINE_PREPROCESSOR,
-                              # A footnote preprocessor will
-                              # get inserted here
-                              REFERENCE_PREPROCESSOR]
-
-
-        self.postprocessors = [PRETTIFYPOSTPROCESSOR,
-                               # a footnote postprocessor will get
-                               # inserted later
-                               ]
-
-        self.textPostprocessors = [# a footnote postprocessor will get
-                                   # inserted here
-                                   RAWHTMLTEXTPOSTPROCESSOR,
-                                   AMPSUBSTITUTETEXTPOSTPROCESSOR]
-
-        self.prePatterns = []
-                               
-        self.inlinePatterns = [
-                               BACKTICK_PATTERN,
-                               ESCAPE_PATTERN,
-                               REFERENCE_PATTERN,
-                               LINK_PATTERN,
-                               IMAGE_LINK_PATTERN,
-                               IMAGE_REFERENCE_PATTERN,
-                               AUTOLINK_PATTERN,
-                               AUTOMAIL_PATTERN,
-                               LINE_BREAK_PATTERN_2,
-                               LINE_BREAK_PATTERN,
-                               HTML_PATTERN,
-                               ENTITY_PATTERN,
-                               NOT_STRONG_PATTERN,
-                               STRONG_EM_PATTERN,
-                               STRONG_PATTERN,
-                               EMPHASIS_PATTERN,
-                               EMPHASIS_PATTERN_2
-                               # The order of the handlers matters!!!
-                               ]
-        
-        self.inlineStash = InlineStash()
-        self.references = {}
-        self.htmlStash = HtmlStash()
-
-
-        self.registerExtensions(extensions = extensions,
-                                configs = extension_configs)
-
-        self.reset()
-
-
-    def registerExtensions(self, extensions, configs):
-        """ 
-        Register extensions with this instance of Markdown.
-
-        Keyword aurguments:
-        
-        * extensions: A list of extensions, which can either
-           be strings or objects.  See the docstring on Markdown.
-        * configs: A dictionary mapping module names to config options. 
-        
-        """
-        for ext in extensions:
-            if isinstance(ext, basestring):
-                ext = load_extension(ext, configs.get(ext, []))
-            elif hasattr(ext, 'extendMarkdown'):
-                # Looks like an Extension.
-                # Nothing to do here.
-                pass
-            else:
-                message(ERROR, "Incorrect type! Extension '%s' is "
-                               "neither a string or an Extension." %(repr(ext)))
-                continue
-            ext.extendMarkdown(self, globals())
-
-    def registerExtension(self, extension):
-        """ This gets called by the extension """
-        self.registeredExtensions.append(extension)
-
-    def reset(self):
-        """
-        Resets all state variables so that we can start with a new text.
-        """
-        self.inlineStash.rest()
-        self.htmlStash.rest()
-        self.references.clear()
-
-        HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash
-        LINE_PREPROCESSOR.stash = self.htmlStash
-        REFERENCE_PREPROCESSOR.references = self.references
-        HTML_PATTERN.stash = self.htmlStash
-        ENTITY_PATTERN.stash = self.htmlStash
-        REFERENCE_PATTERN.references = self.references
-        IMAGE_REFERENCE_PATTERN.references = self.references
-        RAWHTMLTEXTPOSTPROCESSOR.stash = self.htmlStash
-        RAWHTMLTEXTPOSTPROCESSOR.safeMode = self.safeMode
-
-        for extension in self.registeredExtensions:
-            extension.reset()
-
-        for pattern in self.inlinePatterns:
-            pattern.safe_mode = self.safeMode
-
-    def _handleInline(self, data, patternIndex=0):
+    def __handleInline(self, data, patternIndex=0):
         """
         Process string with inline patterns and replace it
         with placeholders
@@ -1604,23 +1463,20 @@ class Markdown:
         Returns: String with placeholders. 
         
         """
-        if isinstance(data, AtomicString):
-            return data
-
-        startIndex = 0        
-        while patternIndex < len(self.inlinePatterns):
-            data, matched, startIndex = self._applyInline(
-                                             self.inlinePatterns[patternIndex],
-                                             data, patternIndex, startIndex)
-            if not matched:
-                patternIndex += 1
+        if not isinstance(data, AtomicString):
+            startIndex = 0        
+            while patternIndex < len(self.inlinePatterns):
+                data, matched, startIndex = self.__applyPattern(
+                                                 self.inlinePatterns[patternIndex],
+                                                 data, patternIndex, startIndex)
+                if not matched:
+                    patternIndex += 1
         return data
 
-
-    def _processElementText(self, node, subnode, isText=True):
+    def __processElementText(self, node, subnode, isText=True):
         """
         Process placeholders in Element.text or Element.tail
-        of Elements popped from InlineStash
+        of Elements popped from self.stashed_nodes.
         
         Keywords arguments:
         
@@ -1638,7 +1494,7 @@ class Markdown:
             text = subnode.tail
             subnode.tail = None
         
-        childResult = self._processPlaceholders(text, subnode)
+        childResult = self.__processPlaceholders(text, subnode)
         
         if not isText and node is not subnode:
             pos = node.getchildren().index(subnode)
@@ -1650,7 +1506,7 @@ class Markdown:
         for newChild in childResult:
             node.insert(pos, newChild)
     
-    def _processPlaceholders(self, data, parent):
+    def __processPlaceholders(self, data, parent):
         """
         Process string with placeholders and generate ElementTree tree.
         
@@ -1675,16 +1531,14 @@ class Markdown:
                         parent.text = text
             
         result = []
-        prefix = self.inlineStash.prefix
-        strartIndex = 0
-    
+        strartIndex = 0    
         while data:
-            index = data.find(prefix, strartIndex)
+            index = data.find(self.__placeholder_prefix, strartIndex)
             if index != -1:
-                id, phEndIndex = self.inlineStash.extractId(data, index)
+                id, phEndIndex = self.__findPlaceholder(data, index)
 
-                if self.inlineStash.isin(id):
-                    node = self.inlineStash.get(id)
+                if self.stashed_nodes.has_key(id):
+                    node = self.stashed_nodes.get(id)
              
                     if index > 0:
                         text = data[strartIndex:index]
@@ -1694,10 +1548,10 @@ class Markdown:
                         for child in [node] + node.getchildren():
                             if child.tail:
                                 if child.tail.strip():
-                                    self._processElementText(node, child, False)
+                                    self.__processElementText(node, child, False)
                             if child.text:
                                 if child.text.strip():
-                                    self._processElementText(child, child)
+                                    self.__processElementText(child, child)
                     else: # it's just a string
                         linkText(node)
                         strartIndex = phEndIndex
@@ -1718,10 +1572,10 @@ class Markdown:
         return result
 
     
-    def _applyInline(self, pattern, data, patternIndex, startIndex=0):
+    def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
         """ 
         Check if the line fits the pattern, create the necessary 
-        elements, add it to InlineStash
+        elements, add it to stashed_nodes.
         
         Keyword arguments:
         
@@ -1749,17 +1603,17 @@ class Markdown:
                 for child in [node] + node.getchildren():
                     if not isString(node):
                         if child.text:
-                            child.text = self._handleInline(child.text, 
+                            child.text = self.__handleInline(child.text, 
                                                             patternIndex + 1)
                         if child.tail:
-                            child.tail = self._handleInline(child.tail, 
+                            child.tail = self.__handleInline(child.tail, 
                                                             patternIndex)
    
-        pholder = self.inlineStash.add(node, pattern.type())
+        placeholder = self.__stashNode(node, pattern.type())
 
         return "%s%s%s%s" % (leftData, 
                              match.group(1), 
-                             pholder, match.groups()[-1]), True, 0
+                             placeholder, match.groups()[-1]), True, 0
 
     
     def applyInlinePatterns(self, markdownTree):
@@ -1777,6 +1631,8 @@ class Markdown:
 
         Returns: ElementTree object with applied inline patterns.
         """
+        self.stashed_nodes = {}
+
         stack = [markdownTree.getroot()]
 
         while stack:
@@ -1786,7 +1642,7 @@ class Markdown:
                 if child.text and not isinstance(child.text, AtomicString):
                     text = child.text
                     child.text = None
-                    lst = self._processPlaceholders(self._handleInline(
+                    lst = self.__processPlaceholders(self.__handleInline(
                                                     text), child)
                     stack += lst
                     insertQueue.append((child, lst))
@@ -1811,6 +1667,139 @@ class Markdown:
                
         return markdownTree
 
+
+           
+
+class Markdown:
+    """Convert Markdown to HTML."""
+
+    def __init__(self, 
+                 extensions=[],
+                 extension_configs={},
+                 safe_mode = False):
+        """
+        Creates a new Markdown instance.
+
+        Keyword arguments:
+        
+        * extensions: A list of extensions.  
+           If they are of type string, the module mdx_name.py will be loaded.  
+           If they are a subclass of markdown.Extension, they will be used 
+           as-is.
+        * extension_configs: Configuration settings for extensions.
+        * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
+        
+        """
+        self.parser = MarkdownParser()
+        self.safeMode = safe_mode
+        self.registeredExtensions = []
+        self.docType = ""
+        self.stripTopLevelTags = True
+
+        self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR]
+
+        self.preprocessors = [HEADER_PREPROCESSOR,
+                              LINE_PREPROCESSOR,
+                              # A footnote preprocessor will
+                              # get inserted here
+                              REFERENCE_PREPROCESSOR]
+
+
+        self.postprocessors = [PRETTIFYPOSTPROCESSOR,
+                               # a footnote postprocessor will get
+                               # inserted later
+                               ]
+
+        self.textPostprocessors = [# a footnote postprocessor will get
+                                   # inserted here
+                                   RAWHTMLTEXTPOSTPROCESSOR,
+                                   AMPSUBSTITUTETEXTPOSTPROCESSOR]
+
+        self.prePatterns = []
+                               
+        self.inlinePatterns = [
+                               BACKTICK_PATTERN,
+                               ESCAPE_PATTERN,
+                               REFERENCE_PATTERN,
+                               LINK_PATTERN,
+                               IMAGE_LINK_PATTERN,
+                               IMAGE_REFERENCE_PATTERN,
+                               AUTOLINK_PATTERN,
+                               AUTOMAIL_PATTERN,
+                               LINE_BREAK_PATTERN_2,
+                               LINE_BREAK_PATTERN,
+                               HTML_PATTERN,
+                               ENTITY_PATTERN,
+                               NOT_STRONG_PATTERN,
+                               STRONG_EM_PATTERN,
+                               STRONG_PATTERN,
+                               EMPHASIS_PATTERN,
+                               EMPHASIS_PATTERN_2
+                               # The order of the handlers matters!!!
+                               ]
+        
+        self.inlineProcessor = InlineProcessor(self.inlinePatterns)
+        self.references = {}
+        self.htmlStash = HtmlStash()
+
+
+        self.registerExtensions(extensions = extensions,
+                                configs = extension_configs)
+
+        self.reset()
+
+
+    def registerExtensions(self, extensions, configs):
+        """ 
+        Register extensions with this instance of Markdown.
+
+        Keyword arguments:
+        
+        * extensions: A list of extensions, which can either
+           be strings or objects.  See the docstring on Markdown.
+        * configs: A dictionary mapping module names to config options. 
+        
+        """
+        for ext in extensions:
+            if isinstance(ext, basestring):
+                ext = load_extension(ext, configs.get(ext, []))
+            elif hasattr(ext, 'extendMarkdown'):
+                # Looks like an Extension.
+                # Nothing to do here.
+                pass
+            else:
+                message(ERROR, "Incorrect type! Extension '%s' is "
+                               "neither a string or an Extension." %(repr(ext)))
+                continue
+            ext.extendMarkdown(self, globals())
+
+    def registerExtension(self, extension):
+        """ This gets called by the extension """
+        self.registeredExtensions.append(extension)
+
+    def reset(self):
+        """
+        Resets all state variables so that we can start with a new text.
+        """
+        self.htmlStash.reset()
+        self.references.clear()
+
+        HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash
+        LINE_PREPROCESSOR.stash = self.htmlStash
+        REFERENCE_PREPROCESSOR.references = self.references
+        HTML_PATTERN.stash = self.htmlStash
+        ENTITY_PATTERN.stash = self.htmlStash
+        REFERENCE_PATTERN.references = self.references
+        IMAGE_REFERENCE_PATTERN.references = self.references
+        RAWHTMLTEXTPOSTPROCESSOR.stash = self.htmlStash
+        RAWHTMLTEXTPOSTPROCESSOR.safeMode = self.safeMode
+
+        for extension in self.registeredExtensions:
+            extension.reset()
+
+        for pattern in self.inlinePatterns:
+            pattern.safe_mode = self.safeMode
+
     def convert (self, source):
         """Convert markdown to serialized XHTML."""
 
@@ -1823,10 +1812,8 @@ class Markdown:
             message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
             return u""
 
-        source = source.replace(STX, "")
-        source = source.replace(ETX, "")
-        source = source.replace("\r\n", "\n").replace("\r", "\n")
-        source += "\n\n"
+        source = source.replace(STX, "").replace(ETX, "")
+        source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
         source = source.expandtabs(TAB_LENGTH)
 
         # Run the text preprocessors
@@ -1841,7 +1828,8 @@ class Markdown:
         # Parse the high-level elements.
         tree = self.parser.parseDocument(self.lines)
 
-        root = self.applyInlinePatterns(tree).getroot()
+        # Apply inline patterns
+        root = self.inlineProcessor.applyInlinePatterns(tree).getroot()
 
         # Run the post-processors
         for postprocessor in self.postprocessors:
@@ -2042,7 +2030,6 @@ def markdownFromFile(input = None,
                      encoding = None,
                      safe = False):
 
-
     md = Markdown(extensions=load_extensions(extensions),
                   safe_mode = safe_mode)
     md.convertFile(input, output, encoding)
-- 
cgit v1.2.3