All sorts of cleanup.

The bigger changes include getting rid of old BOM-removal logic and getting rid of BlockGuru. Most of the changes are just re-ordering of functions, removal of whitespace, adding comments, etc.
author: Yuri Takhteyev <yuri@freewisdom.org> 2008-10-07 01:32:56 -0700
committer: Yuri Takhteyev <yuri@freewisdom.org> 2008-10-07 01:32:56 -0700
commit: 40b8986ccf0ea3fa37dda469b46261dfbf0c25a4 (patch)
tree: c29877335eea157e1bfcea87c7332e1ab4f6ff6a
parent: 6d719bd60b31e7fad3aae345a30f2820e2fd6215 (diff)
download: markdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.tar.gz
markdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.tar.bz2
markdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.zip
2 files changed, 292 insertions, 397 deletions
diff --git a/markdown.py b/markdown.py
index 52f278e..95bf61d 100755
--- a/markdown.py
+++ b/markdown.py
@@ -42,174 +42,132 @@ License: BSD (see docs/LICENSE for details).
 version = "2.0-alpha"
 version_info = (2,0,0, "beta")
 
-import re, sys, codecs, htmlentitydefs
+import re
+import sys
+import codecs
+import htmlentitydefs
 import logging
 from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
 from urlparse import urlparse, urlunparse
 
 
-# --------------- Constants you might want to modify ------------------------
-COMMAND_LINE_LOGGING_LEVEL = CRITICAL
-TAB_LENGTH = 4            # expand tabs to this many spaces
-ENABLE_ATTRIBUTES = True  # @id = xyz -> <... id="xyz">
+"""
+CONSTANTS
+=============================================================================
+"""
+
+"""
+Constants you might want to modify
+-----------------------------------------------------------------------------
+"""
+
+# default logging level for command-line use
+COMMAND_LINE_LOGGING_LEVEL = CRITICAL  
+TAB_LENGTH = 4               # expand tabs to this many spaces
+ENABLE_ATTRIBUTES = True     # @id = xyz -> <... id="xyz">
 SMART_EMPHASIS = True        # this_or_that does not become this<i>or</i>that
 HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
+BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
+                                  +"|script|noscript|form|fieldset|iframe|math"
+                                  +"|ins|del|hr|hr/|style|li|tr")
+
+"""
+Constants you probably do not need to change
+-----------------------------------------------------------------------------
+"""
 
+RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
+                     # Hebrew (0590-05FF), Arabic (0600-06FF),
+                     # Syriac (0700-074F), Arabic supplement (0750-077F),
+                     # Thaana (0780-07BF), Nko (07C0-07FF).
+                    (u'\u2D30', u'\u2D7F'), # Tifinagh
+                    )
 
-# --------------- Auxiliary functions ---------------------------------------
+EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
+""" The name used in the usage statement displayed for python versions < 2.3.
+(With python 2.3 and higher the usage statement is generated by optparse
+and uses the actual name of the executable called.) """
+
+# Placeholders
+STX = u'\u0002'  # Use STX ("Start of text") for start-of-placeholder
+ETX = u'\u0003'  # Use ETX ("End of text") for end-of-placeholder
+HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:"
+HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX
+INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
+INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
+AMP_SUBSTITUTE = STX+"amp"+ETX 
+
+
+"""
+AUXILIARY GLOBAL FUNCTIONS
+=============================================================================
+"""
 
 def message(level, text):
-    ''' A wrapper method for logging debug messages. '''
+    """ A wrapper method for logging debug messages. """
     logging.getLogger('MARKDOWN').log(level, text)
     
-def isstr(s):
+def isString(s):
     """ Check if it's string """
     return isinstance(s, unicode) or isinstance(s, str)
 
 ## Import 
 def importETree(): 
-    """ Import best variant of ElementTree and return module object """
-    cetree = None  
-    try:
-        # Python 2.5+
-        import xml.etree.cElementTree as cetree
+    """Import the best implementation of ElementTree, return a module object."""
+    etree_in_c = None  
+    try: # Is it Python 2.5+ with C implementation of ElementTree installed?
+        import xml.etree.cElementTree as etree_in_c
     except ImportError:
-        try:
-            # Python 2.5+
+        try: # Is it Python 2.5+ with Python implementation of ElementTree?
             import xml.etree.ElementTree as etree
         except ImportError:
-            try:
-                # normal cElementTree install
-                import cElementTree as cetree
+            try: # An earlier version of Python with cElementTree installed?
+                import cElementTree as etree_in_c
             except ImportError:
-                try:
-                    # normal ElementTree install
+                try: # An earlier version of Python with Python ElementTree?
                     import elementtree.ElementTree as etree
                 except ImportError:
-                    message(CRITICAL, 
-                           "Failed to import ElementTree from any known place")
+                    message(CRITICAL, "Failed to import ElementTree")
                     sys.exit(1)
-    if cetree:
-        if cetree.VERSION < "1.0":
-            message(CRITICAL, 
-                           "cElementTree version is too old, 1.0 and upper required")
-            sys.exit(1)
-            
-        etree = cetree
-    else:
-        if etree.VERSION < "1.1":
-            message(CRITICAL, 
-                           "ElementTree version is too old, 1.1 and upper required")
-            sys.exit(1)
-            
-    return etree
-
-"""ElementTree module
-in extensions use: `from markdown import etree`
-to access to the ElemetTree module, do not import it by yourself"""
-etree = importETree() 
-
-RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
-                    # from Hebrew to Nko (includes Arabic, Syriac and Thaana)
-                    (u'\u2D30', u'\u2D7F'),
-                    # Tifinagh
-                    )
-
-# Unicode Reference Table:
-# 0590-05FF - Hebrew
-# 0600-06FF - Arabic
-# 0700-074F - Syriac
-# 0750-077F - Arabic Supplement
-# 0780-07BF - Thaana
-# 07C0-07FF - Nko
-
-BOMS = { 'utf-8': (codecs.BOM_UTF8, ),
-         'utf-16': (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE),
-         #'utf-32': (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)
-         }
-
-def removeBOM(text, encoding):
-    """
-    Used by `markdownFromFile` to remove a "byte order mark" from the begining
-    of an utf-8, utf-16 or utf-32 encoded file.
-    """
+    if etree_in_c and etree_in_c.VERSION < "1.0":
+        message(CRITICAL, "For cElementTree version 1.0 or higher is required.")
+        sys.exit(1)
+    elif etree_in_c :
+        return etree_in_c
+    elif etree.VERSION < "1.1":
+        message(CRITICAL, "For ElementTree version 1.1 or higher is required")
+        sys.exit(1)
+    else :        
+        return etree
     
-    convert = isinstance(text, unicode)
-    for bom in BOMS[encoding]:
-        bom = convert and bom.decode(encoding) or bom
-        if text.startswith(bom):
-            return text.lstrip(bom)
-    return text
-
-
-# The following constant specifies the name used in the usage
-# statement displayed for python versions lower than 2.3.  (With
-# python2.3 and higher the usage statement is generated by optparse
-# and uses the actual name of the executable called.)
-
-EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
-                    
-
-# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
-
-
-# placeholders
-STX = u'\u0002'  # Use STX ("Start of text") for start-of-placeholder
-ETX = u'\u0003'  # Use ETX ("End of text") for end-of-placeholder
-HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:"
-HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX
-INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
-INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
-
-AMP_SUBSTITUTE = STX+"amp"+ETX 
-
-BLOCK_LEVEL_ELEMENTS = re.compile('p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del|hr|hr/|style|li|tr')
-
 def isBlockLevel(tag):
-    """
-    Used by HTMLBlockPreprocessor to check if a given tag is a block level 
-    element.
-    """
+    """Check if the tag is a block level HTML tag."""
     return BLOCK_LEVEL_ELEMENTS.match(tag)
 
-
-def codepoint2name(code):
-    """ 
-    Return entity definition by code, or code 
-    if there is no such entity definition
-    """
-    entity = htmlentitydefs.codepoint2name.get(code)
-    if entity:
-        return "%s%s;" % (AMP_SUBSTITUTE, entity)
-    else:
-        return "%s#%d;" % (AMP_SUBSTITUTE, code)
-    
 def handleAttributes(text, parent):
-    """ Handale attributes, e.g {@id=123} """
+    """Set values of an element based on attribute definitions ({@id=123})."""
     def attributeCallback(match):
         parent.set(match.group(1), match.group(2))
+    return CORE_RE['attr'].sub(attributeCallback, text)
 
-    return RE.regExp['attr'].sub(attributeCallback, text)
-    
-
-class AtomicString(unicode):
-    "A string which should not be further processed."
-    pass
+def dequote(string):
+    """Remove quotes from around a string."""
+    if ( ( string.startswith('"') and string.endswith('"'))
+         or (string.startswith("'") and string.endswith("'")) ):
+        return string[1:-1]
+    else:
+        return string
 
 
 """
-======================================================================
-========================== PRE-PROCESSORS ============================
-======================================================================
-
-Preprocessors munge source text before we start doing anything too
-complicated.
-
-There are two types of preprocessors: TextPreprocessor and Preprocessor.
+PRE-PROCESSORS
+=============================================================================
 
+Preprocessors work on source text before we start doing anything too
+complicated.  There are two types of preprocessors: TextPreprocessor and
+Preprocessor.
 """
 
-
 class TextPreprocessor:
     """
     TextPreprocessors are run before the text is broken into lines.
@@ -255,27 +213,22 @@ class Preprocessor:
  
 
 class HtmlBlockPreprocessor(TextPreprocessor):
-    """
-    Remove html blocks from the source text and store them for later retrieval.
-    """
+    """Remove html blocks from the text and store them for later retrieval."""
+
     right_tag_patterns = ["</%s>", "%s>"]
     
     def _get_left_tag(self, block):
         return block[1:].replace(">", " ", 1).split()[0].lower()
 
-
-    def _get_right_tag(self, left_tag, block):
-        
+    def _get_right_tag(self, left_tag, block):        
         for p in self.right_tag_patterns:
             tag = p % left_tag
             i = block.rfind(tag)
             if i > 2:
                 return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
-            
         return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
 
     def _equal_tags(self, left_tag, right_tag):
-        
         if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
             return True
         if ("/" + left_tag) == right_tag:
@@ -291,17 +244,14 @@ class HtmlBlockPreprocessor(TextPreprocessor):
     def _is_oneliner(self, tag):
         return (tag in ['hr', 'hr/'])
 
-    
     def run(self, text):
-        """ Find and remove raw html from text. """
         new_blocks = []
-        text = text.split("\n\n")
-        
+        text = text.split("\n\n")        
         items = []
         left_tag = ''
         right_tag = ''
         in_tag = False # flag
-        
+
         while text:
             block = text[0]
             if block.startswith("\n"):
@@ -312,9 +262,7 @@ class HtmlBlockPreprocessor(TextPreprocessor):
                 block = block[1:]
 
             if not in_tag:
-
                 if block.startswith("<"):
-                    
                     left_tag = self._get_left_tag(block)
                     right_tag, data_index = self._get_right_tag(left_tag, block)
                     
@@ -380,14 +328,13 @@ HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
 
 class HeaderPreprocessor(Preprocessor):
 
-    """
-    Replace underlined headers with hashed headers to avoid
-    the need for lookahead later.
+    """Replace underlined headers with hashed headers.
+
+    (To avoid the need for lookahead later.)
+
     """
 
     def run (self, lines):
-        """ Find and replace underlined headers. """
-
         i = -1
         while i+1 < len(lines):
             i = i+1
@@ -416,14 +363,10 @@ HEADER_PREPROCESSOR = HeaderPreprocessor()
 
 
 class LinePreprocessor(Preprocessor):
-    """ 
-    Convert HR lines to "___" format 
-    """
+    """Convert HR lines to "___" format."""
     blockquote_re = re.compile(r'^(> )+')
 
     def run (self, lines):
-        """ Find and replace HR lines. """
-
         for i in range(len(lines)):
             prefix = ''
             m = self.blockquote_re.search(lines[i])
@@ -441,7 +384,7 @@ class LinePreprocessor(Preprocessor):
         if len(text) <= 2:
             return False
         for pattern in ['isline1', 'isline2', 'isline3']:
-            m = RE.regExp[pattern].match(text)
+            m = CORE_RE[pattern].match(text)
             if (m and m.group(1)):
                 return True
         else:
@@ -451,16 +394,11 @@ LINE_PREPROCESSOR = LinePreprocessor()
 
 
 class ReferencePreprocessor(Preprocessor):
-    """
-    Remove reference definitions from the text and store them for later use.
-    
-    """
-    
+    """Remove reference definitions from the text and store them for later use."""    
     def run (self, lines):
-        """ Remove and store reference defs. """
         new_text = [];
         for line in lines:
-            m = RE.regExp['reference-def'].match(line)
+            m = CORE_RE['reference-def'].match(line)
             if m:
                 id = m.group(2).strip().lower()
                 t = m.group(4).strip()  # potential title
@@ -481,20 +419,21 @@ class ReferencePreprocessor(Preprocessor):
 REFERENCE_PREPROCESSOR = ReferencePreprocessor()
 
 
+
+
 """
-======================================================================
-========================== INLINE PATTERNS ===========================
-======================================================================
+INLINE PATTERNS
+=============================================================================
 
 Inline patterns such as *emphasis* are handled by means of auxiliary
 objects, one per pattern.  Pattern objects must be instances of classes
 that extend markdown.Pattern.  Each pattern object uses a single regular
 expression and needs support the following methods:
 
-  pattern.getCompiledRegExp() - returns a regular expression
+    pattern.getCompiledRegExp() # returns a regular expression
 
-  pattern.handleMatch(m) - takes a match object and returns
-                                a ElementTree element or just plain text
+    pattern.handleMatch(m) # takes a match object and returns
+                           # an ElementTree element or just plain text
 
 All of python markdown's built-in patterns subclass from Pattern,
 but you can add additional patterns that don't.
@@ -509,20 +448,26 @@ important - e.g. if we first replace http://.../ links with <a> tags
 and _then_ try to replace inline html, we would end up with a mess.
 So, we apply the expressions in the following order:
 
-       * escape and backticks have to go before everything else, so
-         that we can preempt any markdown patterns by escaping them.
+* escape and backticks have to go before everything else, so
+  that we can preempt any markdown patterns by escaping them.
+
+* then we handle auto-links (must be done before inline html)
+
+* then we handle inline HTML.  At this point we will simply
+  replace all inline HTML strings with a placeholder and add
+  the actual HTML to a hash.
 
-       * then we handle auto-links (must be done before inline html)
+* then inline images (must be done before links)
 
-       * then we handle inline HTML.  At this point we will simply
-         replace all inline HTML strings with a placeholder and add
-         the actual HTML to a hash.
+* then bracketed links, first regular then reference-style
 
-       * then inline images (must be done before links)
+* finally we apply strong and emphasis
+"""
 
-       * then bracketed links, first regular then reference-style
 
-       * finally we apply strong and emphasis
+"""
+The actual regular expressions for patterns
+-----------------------------------------------------------------------------
 """
 
 NOBRACKET = r'[^\]\[]*'
@@ -558,6 +503,12 @@ ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'               # &amp;
 LINE_BREAK_RE = r'  \n'                     # two spaces at end of line
 LINE_BREAK_2_RE = r'  $'                    # two spaces at end of text
 
+
+"""
+The pattern classes
+-----------------------------------------------------------------------------
+"""
+
 class Pattern:
     """Base class that inline patterns subclass. """
 
@@ -581,9 +532,9 @@ class Pattern:
         return self.compiled_re
 
     def handleMatch(self, m):
-        """
-        Return a ElementTree element from the given match. Subclasses should 
-        override this method.
+        """Return a ElementTree element from the given match.
+
+        Subclasses should override this method.
 
         Keyword arguments:
 
@@ -639,8 +590,8 @@ class BacktickPattern (Pattern):
 
 
 class DoubleTagPattern (SimpleTagPattern): 
-    """ 
-    Return a ElementTree element nested in tag2 nested in tag1. 
+    """Return a ElementTree element nested in tag2 nested in tag1.
+
     Useful for strong emphasis etc.
 
     """
@@ -665,7 +616,6 @@ class HtmlPattern (Pattern):
 class LinkPattern (Pattern):
     """ Return a link element from the given match. """
     def handleMatch(self, m):
-
         el = etree.Element("a")
         el.text = m.group(2)
         title = m.group(11)
@@ -796,6 +746,14 @@ class AutomailPattern (Pattern):
         if email.startswith("mailto:"):
             email = email[len("mailto:"):]
 
+        def codepoint2name(code):
+            """Return entity definition by code, or the code if not defined."""
+            entity = htmlentitydefs.codepoint2name.get(code)
+            if entity:
+                return "%s%s;" % (AMP_SUBSTITUTE, entity)
+            else:
+                return "%s#%d;" % (AMP_SUBSTITUTE, code)
+
         letters = [codepoint2name(ord(letter)) for letter in email]
         el.text = AtomicString(''.join(letters))
 
@@ -831,18 +789,16 @@ AUTOMAIL_PATTERN        = AutomailPattern(AUTOMAIL_RE)
 
 
 """
-======================================================================
-========================== POST-PROCESSORS ===========================
-======================================================================
+POST-PROCESSORS
+=============================================================================
 
-Markdown also allows post-processors, which are similar to
-preprocessors in that they need to implement a "run" method. However,
-they are run after core processing.
+Markdown also allows post-processors, which are similar to preprocessors in
+that they need to implement a "run" method. However, they are run after core
+processing.
 
 There are two types of post-processors: Postprocessor and TextPostprocessor
 """
 
-
 class Postprocessor:
     """
     Postprocessors are run before the ElementTree serialization.
@@ -863,7 +819,6 @@ class Postprocessor:
         pass
 
 
-
 class TextPostprocessor:
     """
     TextPostprocessors are run after the ElementTree it converted back into text.
@@ -884,12 +839,11 @@ class TextPostprocessor:
         """
         pass
 
-class PrettifyPostprocessor(Postprocessor):
-    """ Add linebreaks to the html document. """
 
+class PrettifyPostprocessor(Postprocessor):
+    """Add linebreaks to the html document."""
     def _prettifyETree(self, elem):
-        """ Recursively add linebreaks to ElementTree children. """
-     
+        """Recursively add linebreaks to ElementTree children."""
         i = "\n"
         if isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
             if (not elem.text or not elem.text.strip()) \
@@ -904,8 +858,7 @@ class PrettifyPostprocessor(Postprocessor):
             elem.tail = i
 
     def run(self, root):
-        """ Add linebreaks to ElementTree root object """
-        
+        """Add linebreaks to ElementTree root object."""
         self._prettifyETree(root)
         # Do <br />'s seperately as they are often in the middle of
         # inline content and missed by _prettifyETree.
@@ -965,11 +918,15 @@ AMPSUBSTITUTETEXTPOSTPROCESSOR = AndSubstitutePostprocessor()
 
 
 """
-======================================================================
-========================== MISC AUXILIARY CLASSES ====================
-======================================================================
+MISC AUXILIARY CLASSES
+=============================================================================
 """
 
+class AtomicString(unicode):
+    """A string which should not be further processed."""
+    pass
+
+
 class HtmlStash:
     """
     This class is used for stashing HTML objects that we extract
@@ -1004,98 +961,6 @@ class HtmlStash:
         self.html_counter = 0
         self.rawHtmlBlocks = []
 
-
-class BlockGuru:
-    """ Parse document for block level constructs (paragraphs, lists, etc.)."""
-
-    def _findHead(self, lines, fn, allowBlank=0):
-
-        """
-        Functional magic to help determine boundaries of indented
-        blocks.
-
-        Keyword arguments:
-        
-        * lines: an array of strings
-        * fn: a function that returns a substring of a string
-           if the string matches the necessary criteria
-        * allowBlank: specifies whether it's ok to have blank
-           lines between matching functions
-        
-        Returns: a list of post processes items and the unused
-        remainder of the original list
-        
-        """
-        items = []
-        item = -1
-
-        i = 0 # to keep track of where we are
-
-        for line in lines:
-
-            if not line.strip() and not allowBlank:
-                return items, lines[i:]
-
-            if not line.strip() and allowBlank:
-                # If we see a blank line, this _might_ be the end
-                i += 1
-
-                # Find the next non-blank line
-                for j in range(i, len(lines)):
-                    if lines[j].strip():
-                        next = lines[j]
-                        break
-                else:
-                    # There is no more text => this is the end
-                    break
-
-                # Check if the next non-blank line is still a part of the list
-
-                part = fn(next)
-
-                if part:
-                    items.append("")
-                    continue
-                else:
-                    break # found end of the list
-
-            part = fn(line)
-
-            if part:
-                items.append(part)
-                i += 1
-                continue
-            else:
-                return items, lines[i:]
-        else:
-            i += 1
-
-        return items, lines[i:]
-
-
-    def detabbed_fn(self, line):
-        """ An auxiliary method to be passed to _findHead """
-        m = RE.regExp['tabbed'].match(line)
-        if m:
-            return m.group(4)
-        else:
-            return None
-
-
-    def detectTabbed(self, lines):
-        """ Find indented text and remove indent before further proccesing. """
-        return self._findHead(lines, self.detabbed_fn,
-                              allowBlank = 1)
-
-
-def dequote(string):
-    """ Removes quotes from around a string """
-    if ( ( string.startswith('"') and string.endswith('"'))
-         or (string.startswith("'") and string.endswith("'")) ):
-        return string[1:-1]
-    else:
-        return string
-    
     
 class InlineStash:
     
@@ -1150,52 +1015,34 @@ class InlineStash:
         self._nodes = {}
     
 """
-======================================================================
-========================== CORE MARKDOWN =============================
-======================================================================
+CORE MARKDOWN
+=============================================================================
 
-This stuff is hard, so if you are thinking of extending the syntax,
-see first if you can do it via pre-processors, post-processors,
-inline patterns or a combination of the three.
+The core part is still quite messy, despite substantial refactoring.  If you
+are thinking of extending the syntax, see first if you can do it through
+pre-processors, post-processors, inline patterns or a combination of the three.
 """
 
-class CorePatterns:
-    """
-    This class is scheduled for removal as part of a refactoring effort.
-    """
-
-    patterns = {
-        'header':          r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)', # # A title
-        'reference-def':   r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)',
-                           # [Google]: http://www.google.com/
-        'containsline':    r'([-]*)$|^([=]*)', # -----, =====, etc.
-        'ol':              r'[ ]{0,3}[\d]*\.\s+(.*)', # 1. text
-        'ul':              r'[ ]{0,3}[*+-]\s+(.*)', # "* text"
-        'isline1':         r'(\**)', # ***
-        'isline2':         r'(\-*)', # ---
-        'isline3':         r'(\_*)', # ___
-        'tabbed':          r'((\t)|(    ))(.*)', # an indented line
-        'quoted':          r'[ ]{0,2}> ?(.*)', # a quoted block ("> ...")
-    }
-
-    def __init__ (self):
-
-        self.regExp = {}
-        for key in self.patterns.keys():
-            self.regExp[key] = re.compile("^%s$" % self.patterns[key],
-                                          re.DOTALL)
-
-        self.regExp['containsline'] = re.compile(r'^([-]*)$|^([=]*)$', re.M)
-        self.regExp['attr'] = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
-
-RE = CorePatterns()
+def _wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL)
+CORE_RE = {
+    'header':          _wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title
+    'reference-def':   _wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'),
+                               # [Google]: http://www.google.com/
+    'containsline':    _wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc.
+    'ol':              _wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text
+    'ul':              _wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text"
+    'isline1':         _wrapRe(r'(\**)'), # ***
+    'isline2':         _wrapRe(r'(\-*)'), # ---
+    'isline3':         _wrapRe(r'(\_*)'), # ___
+    'tabbed':          _wrapRe(r'((\t)|(    ))(.*)'), # an indented line
+    'quoted':          _wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...")
+    'containsline':    re.compile(r'^([-]*)$|^([=]*)$', re.M),
+    'attr':            re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
+}
 
 
 class Markdown:
-    """ 
-    Markdown formatter class for creating an html document from Markdown text.
-    """
-
+    """Converts markdown to HTML."""
 
     def __init__(self, 
                  extensions=[],
@@ -1216,7 +1063,6 @@ class Markdown:
         """
         self.source = None
         self.safeMode = safe_mode
-        self.blockGuru = BlockGuru()
         self.registeredExtensions = []
         self.docType = ""
         self.stripTopLevelTags = True
@@ -1397,7 +1243,7 @@ class Markdown:
                           'tabbed': self._processCodeBlock}
 
             for regexp in ['ul', 'ol', 'quoted', 'tabbed']:
-                m = RE.regExp[regexp].match(lines[0])
+                m = CORE_RE[regexp].match(lines[0])
                 if m:
                     processFn[regexp](parent_elem, lines, inList)
                     return
@@ -1420,8 +1266,8 @@ class Markdown:
             if inList:
 
                 start, lines  = self._linesUntil(lines, (lambda line:
-                                 RE.regExp['ul'].match(line)
-                                 or RE.regExp['ol'].match(line)
+                                 CORE_RE['ul'].match(line)
+                                 or CORE_RE['ol'].match(line)
                                                   or not line.strip()))
 
                 self._processSection(parent_elem, start,
@@ -1437,7 +1283,7 @@ class Markdown:
                     self._processHeader(parent_elem, paragraph)
                     
                 elif len(paragraph) and \
-                RE.regExp["isline3"].match(paragraph[0]):
+                CORE_RE["isline3"].match(paragraph[0]):
 
                     self._processHR(parent_elem)
                     lines = paragraph[1:] + lines
@@ -1453,7 +1299,7 @@ class Markdown:
         hr = etree.SubElement(parentElem, "hr")
     
     def _processHeader(self, parentElem, paragraph):
-        m = RE.regExp['header'].match(paragraph[0])
+        m = CORE_RE['header'].match(paragraph[0])
         if m:
             level = len(m.group(1))
             h = etree.SubElement(parentElem, "h%d" % level)
@@ -1480,7 +1326,7 @@ class Markdown:
         # Searching for hr or header
         for line in paragraph:
             # it's hr
-            if RE.regExp["isline3"].match(line):
+            if CORE_RE["isline3"].match(line):
                 el.text = "\n".join(dump)
                 self._processHR(el)
                 dump = []
@@ -1548,8 +1394,8 @@ class Markdown:
 
                 # Check if the next non-blank line is still a part of the list
 
-                if ( RE.regExp[listexpr].match(next) or
-                     RE.regExp['tabbed'].match(next) ):
+                if ( CORE_RE[listexpr].match(next) or
+                     CORE_RE['tabbed'].match(next) ):
                     # get rid of any white space in the line
                     items[item].append(line.strip())
                     looseList = loose or looseList
@@ -1562,7 +1408,7 @@ class Markdown:
 
             for expr in ['ul', 'ol', 'tabbed']:
 
-                m = RE.regExp[expr].match(line)
+                m = CORE_RE[expr].match(line)
                 if m:
                     if expr in ['ul', 'ol']:  # We are looking at a new item
                         #if m.group(1) :
@@ -1628,7 +1474,7 @@ class Markdown:
         i = 0
         blank_line = False # allow one blank line between paragraphs
         for line in lines:
-            m = RE.regExp['quoted'].match(line)
+            m = CORE_RE['quoted'].match(line)
             if m:
                 dequoted.append(m.group(1))
                 i += 1
@@ -1667,7 +1513,7 @@ class Markdown:
         Returns: None
         
         """
-        detabbed, theRest = self.blockGuru.detectTabbed(lines)
+        detabbed, theRest = self.detectTabbed(lines)
 
         pre = etree.SubElement(parentElem, "pre")
         code = etree.SubElement(pre, "code")
@@ -1675,6 +1521,59 @@ class Markdown:
         text = "\n".join(detabbed).rstrip()+"\n"
         code.text = AtomicString(text)
         self._processSection(parentElem, theRest, inList)        
+
+    def detectTabbed(self, lines):
+        """ Find indented text and remove indent before further processing.
+
+        Keyword arguments:
+        
+        * lines: an array of strings
+
+        Non-blank lines are matched against CORE_RE['tabbed'] to strip indent.
+        
+        Returns: a list of post-processed items and the unused
+        remainder of the original list
+        
+        """
+        items = []
+        item = -1
+        i = 0 # to keep track of where we are
+
+        def detab(line): # Returns group 4 of the 'tabbed' match, else None.
+            match = CORE_RE['tabbed'].match(line)
+            if match:
+               return match.group(4)
+
+        for line in lines:
+            if line.strip(): # Non-blank line
+                line = detab(line)
+                if line:
+                    items.append(line)
+                    i += 1
+                    continue
+                else:
+                    return items, lines[i:]
+
+            else: # Blank line: _maybe_ we are done.
+                i += 1 # advance
+
+                # Find the next non-blank line
+                for j in range(i, len(lines)):  
+                    if lines[j].strip():
+                        next_line = lines[j]; break
+                else:
+                    break # There is no more text; we are done.
+
+                # Check if the next non-blank line is tabbed
+                if detab(next_line): # Yes, more work to do.
+                    items.append("")
+                    continue
+                else:
+                    break # No, we are done.
+        else: # Reached only when the loop ends without a break.
+            i += 1
+
+        return items, lines[i:]
         
     def _handleInline(self, data, patternIndex=0):
         """
@@ -1730,11 +1629,11 @@ class Markdown:
         if node is None:
             return data, True, len(leftData) + match.span(len(match.groups()))[0]
         
-        if not isstr(node):         
+        if not isString(node):         
             if not isinstance(node.text, AtomicString):
                 # We need to process current node too
                 for child in [node] + node.getchildren():
-                    if not isstr(node):
+                    if not isString(node):
                         if child.text:
                             child.text = self._handleInline(child.text, 
                                                             patternIndex + 1)
@@ -1824,14 +1723,14 @@ class Markdown:
                         text = data[strartIndex:index]
                         linkText(text)
           
-                    if not isstr(node): # it's Element
+                    if not isString(node): # it's Element
                         
                         for child in [node] + node.getchildren():
             
                             if child.tail:
                                 if child.tail.strip():
                                     self._processElementText(node, child, False)
-                            
+
                             if child.text:
                                 if child.text.strip():
                                     self._processElementText(child, child)
@@ -1872,35 +1771,26 @@ class Markdown:
 
         Returns: ElementTree object with applied inline patterns.
         """
-        el = markdownTree.getroot()
-                
-        stack = [el]
+        stack = [markdownTree.getroot()]
 
         while stack:
             currElement = stack.pop()
             insertQueue = []
             for child in currElement.getchildren():
-
-                if not isinstance(child.text, AtomicString) and child.text:
-
+                if child.text and not isinstance(child.text, AtomicString):
                     text = child.text
                     child.text = None
                     lst = self._processPlaceholders(self._handleInline(
                                                     text), child)
                     stack += lst
-                    
-
                     insertQueue.append((child, lst))
                     
-                
                 if child.getchildren():
                     stack.append(child) 
 
-                      
             for element, lst in insertQueue:
                 if element.text:
-                    element.text = handleAttributes(element.text, 
-                                                        element)
+                    element.text = handleAttributes(element.text, element)
                 i = 0
                 for newChild in lst:
                     # Processing attributes
@@ -1913,13 +1803,10 @@ class Markdown:
                     element.insert(i, newChild)
                     i += 1
                
-            
         return markdownTree
 
-        
     def markdownToTree(self, source=None):
-        """
-        Create ElementTree, without applying inline paterns.
+        """Create ElementTree, without applying inline patterns.
         
         Keyword arguments:
         
@@ -1934,10 +1821,8 @@ class Markdown:
             return u""
         
         # Fixup the source text
-
         self.source = self.source.replace(STX, "")
         self.source = self.source.replace(ETX, "")
-
         self.source = self.source.replace("\r\n", "\n").replace("\r", "\n")
         self.source += "\n\n"
         self.source = self.source.expandtabs(TAB_LENGTH)
@@ -1950,8 +1835,7 @@ class Markdown:
         return markdownTree      
 
     def convert (self, source):
-        """
-        Create the document in XHTML format.
+        """Create the document in XHTML format.
 
         Keyword arguments:
         
@@ -1986,10 +1870,8 @@ class Markdown:
 
         return xml.strip()
 
-
     def __str__(self):
-        ''' Report info about instance. Markdown always returns unicode. '''
-
+        """ Report info about instance. Markdown always returns unicode."""
         if self.source is None:
             status = 'in which no source text has been assinged.'
         else:
@@ -2000,26 +1882,29 @@ class Markdown:
     __unicode__ = convert # markdown should always return a unicode string
 
 
+"""
+EXPORTED FUNCTIONS
+=============================================================================
 
-
-
-# ====================================================================
+These are the two functions we really mean to export: markdown() and
+markdownFromFile().
+"""
 
 def markdownFromFile(input = None,
                      output = None,
                      extensions = [],
                      encoding = None,
                      safe = False):
-    """
-    Convenience wrapper function that takes a filename as input.
+    """Converts a markdown file and writes the HTML to a file or stdout.
 
     Used from the command-line, although may be useful in other situations. 
     Decodes the file using the provided encoding (defaults to utf-8), passes 
     the file content to markdown, and outputs the html to either the provided
     filename or stdout in the same encoding as the source file.
 
-    **Note:** This is the only place that decoding and encoding takes place
-    in Python-Markdown.
+    **Note:** This is the only place that decoding and encoding of unicode
+    takes place in Python-Markdown.  (All other code is unicode-in /
+    unicode-out.)
 
     Keyword arguments:
 
@@ -2029,30 +1914,26 @@ def markdownFromFile(input = None,
     * encoding: Encoding of input and output files. Defaults to utf-8.
     * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
 
-    Returns: An HTML document as a string.
-
     """
+    
+    encoding = encoding or "utf-8"
 
-    message(DEBUG, "input file: %s" % input)
-
-    if not encoding:
-        encoding = "utf-8"
-
+    # Read the source
     input_file = codecs.open(input, mode="r", encoding=encoding)
     text = input_file.read()
     input_file.close()
+    text = text.lstrip(u'\ufeff') # remove the byte-order mark
 
-    text = removeBOM(text, encoding)
-
-    new_text = markdown(text, extensions, safe_mode = safe)
+    # Convert
+    html = markdown(text, extensions, safe_mode = safe)
 
+    # Write to file or stdout
     if output:
         output_file = codecs.open(output, "w", encoding=encoding)
-        output_file.write(new_text)
+        output_file.write(html)
         output_file.close()
-
     else:
-        sys.stdout.write(new_text.encode(encoding))
+        sys.stdout.write(html.encode(encoding))
 
 def markdown(text,
              extensions = [],
@@ -2082,11 +1963,15 @@ def markdown(text,
     return md.convert(text)
         
 
+"""
+Extensions
+-----------------------------------------------------------------------------
+"""
+
 class Extension:
     """ Base class for extensions to subclass. """
     def __init__(self, configs = {}):
-        """ 
-        Create an instance of an Extention. 
+        """Create an instance of an Extension. 
         
         Keyword arguments:
 
@@ -2169,9 +2054,19 @@ def load_extension(ext_name, configs = []):
     return module.makeExtension(configs.items())    
 
 
-#############################################################################
-##    Only command-line specific stuff from here down.
-#############################################################################
+# Extensions should use "markdown.etree" instead of "etree" (or do `from
+# markdown import etree`).  Do not import it by yourself.
+
+etree = importETree() 
+
+
+"""
+COMMAND-LINE SPECIFIC STUFF
+=============================================================================
+
+The rest of the code is specifically for handling the case where Python
+Markdown is called from the command line.
+"""
 
 OPTPARSE_WARNING = """
 Python 2.3 or higher required for advanced command line options.
diff --git a/markdown_extensions/codehilite.py b/markdown_extensions/codehilite.py
index a96aaaa..7f4a1a8 100644
--- a/markdown_extensions/codehilite.py
+++ b/markdown_extensions/codehilite.py
@@ -208,7 +208,7 @@ class CodeHiliteExtention(markdown.Extension):
 
             """
 
-            detabbed, theRest = md.blockGuru.detectTabbed(lines)
+            detabbed, theRest = md.detectTabbed(lines)
             text = "\n".join(detabbed).rstrip()+"\n"
             code = CodeHilite(text, linenos=self.config['force_linenos'][0],
                               css_class=self.config['css_class'][0])
author	Yuri Takhteyev <yuri@freewisdom.org>	2008-10-07 01:32:56 -0700
committer	Yuri Takhteyev <yuri@freewisdom.org>	2008-10-07 01:32:56 -0700
commit	40b8986ccf0ea3fa37dda469b46261dfbf0c25a4 (patch)
tree	c29877335eea157e1bfcea87c7332e1ab4f6ff6a
parent	6d719bd60b31e7fad3aae345a30f2820e2fd6215 (diff)
download	markdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.tar.gz markdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.tar.bz2 markdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.zip