From 40b8986ccf0ea3fa37dda469b46261dfbf0c25a4 Mon Sep 17 00:00:00 2001 From: Yuri Takhteyev Date: Tue, 7 Oct 2008 01:32:56 -0700 Subject: All sorts of cleanup. The bigger changes include getting rid of old BOM-removal logic and getting rid of BlockGuru. Most of the changes are just re-ordering of functions, removal of whitespace, adding comments, etc. --- markdown.py | 687 ++++++++++++++++---------------------- markdown_extensions/codehilite.py | 2 +- 2 files changed, 292 insertions(+), 397 deletions(-) diff --git a/markdown.py b/markdown.py index 52f278e..95bf61d 100755 --- a/markdown.py +++ b/markdown.py @@ -42,174 +42,132 @@ License: BSD (see docs/LICENSE for details). version = "2.0-alpha" version_info = (2,0,0, "beta") -import re, sys, codecs, htmlentitydefs +import re +import sys +import codecs +import htmlentitydefs import logging from logging import DEBUG, INFO, WARN, ERROR, CRITICAL from urlparse import urlparse, urlunparse -# --------------- Constants you might want to modify ------------------------ -COMMAND_LINE_LOGGING_LEVEL = CRITICAL -TAB_LENGTH = 4 # expand tabs to this many spaces -ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz"> +""" +CONSTANTS +============================================================================= +""" + +""" +Constants you might want to modify +----------------------------------------------------------------------------- +""" + +# default logging level for command-line use +COMMAND_LINE_LOGGING_LEVEL = CRITICAL +TAB_LENGTH = 4 # expand tabs to this many spaces +ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz"> SMART_EMPHASIS = True # this_or_that does not become thisorthat HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode +BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul" + +"|script|noscript|form|fieldset|iframe|math" + +"|ins|del|hr|hr/|style|li|tr") + +""" +Constants you probably do not need to change +----------------------------------------------------------------------------- +""" +RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'), + # Hebrew (0590-05FF), Arabic (0600-06FF), + # Syriac (0700-074F), Arabic supplement (0750-077F), + # Thaana (0780-07BF), Nko (07C0-07FF). + (u'\u2D30', u'\u2D7F'), # Tifinagh + ) -# --------------- Auxiliary functions --------------------------------------- +EXECUTABLE_NAME_FOR_USAGE = "python markdown.py" +""" The name used in the usage statement displayed for python versions < 2.3. +(With python 2.3 and higher the usage statement is generated by optparse +and uses the actual name of the executable called.) """ + +# Placeholders +STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder +ETX = u'\u0003' # Use ETX ("End of text") for end-of-placeholder +HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:" +HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX +INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:" +INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX +AMP_SUBSTITUTE = STX+"amp"+ETX + + +""" +AUXILIARY GLOBAL FUNCTIONS +============================================================================= +""" def message(level, text): - ''' A wrapper method for logging debug messages. ''' + """ A wrapper method for logging debug messages. """ logging.getLogger('MARKDOWN').log(level, text) -def isstr(s): +def isString(s): """ Check if it's string """ return isinstance(s, unicode) or isinstance(s, str) ## Import def importETree(): - """ Import best variant of ElementTree and return module object """ - cetree = None - try: - # Python 2.5+ - import xml.etree.cElementTree as cetree + """Import the best implementation of ElementTree, return a module object.""" + etree_in_c = None + try: # Is it Python 2.5+ with C implemenation of ElementTree installed? + import xml.etree.cElementTree as etree_in_c except ImportError: - try: - # Python 2.5+ + try: # Is it Python 2.5+ with Python implementation of ElementTree? import xml.etree.ElementTree as etree except ImportError: - try: - # normal cElementTree install - import cElementTree as cetree + try: # An earlier version of Python with cElementTree installed? + import cElementTree as etree_in_c except ImportError: - try: - # normal ElementTree install + try: # An earlier version of Python with Python ElementTree? import elementtree.ElementTree as etree except ImportError: - message(CRITICAL, - "Failed to import ElementTree from any known place") + message(CRITICAL, "Failed to import ElementTree") sys.exit(1) - if cetree: - if cetree.VERSION < "1.0": - message(CRITICAL, - "cElementTree version is too old, 1.0 and upper required") - sys.exit(1) - - etree = cetree - else: - if etree.VERSION < "1.1": - message(CRITICAL, - "ElementTree version is too old, 1.1 and upper required") - sys.exit(1) - - return etree - -"""ElementTree module -in extensions use: `from markdown import etree` -to access to the ElemetTree module, do not import it by yourself""" -etree = importETree() - -RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'), - # from Hebrew to Nko (includes Arabic, Syriac and Thaana) - (u'\u2D30', u'\u2D7F'), - # Tifinagh - ) - -# Unicode Reference Table: -# 0590-05FF - Hebrew -# 0600-06FF - Arabic -# 0700-074F - Syriac -# 0750-077F - Arabic Supplement -# 0780-07BF - Thaana -# 07C0-07FF - Nko - -BOMS = { 'utf-8': (codecs.BOM_UTF8, ), - 'utf-16': (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE), - #'utf-32': (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE) - } - -def removeBOM(text, encoding): - """ - Used by `markdownFromFile` to remove a "byte order mark" from the begining - of an utf-8, utf-16 or utf-32 encoded file. - """ + if etree_in_c and etree_in_c.VERSION < "1.0": + message(CRITICAL, "For cElementTree version 1.0 or higher is required.") + sys.exit(1) + elif etree_in_c : + return etree_in_c + elif etree.VERSION < "1.1": + message(CRITICAL, "For ElementTree version 1.1 or higher is required") + sys.exit(1) + else : + return etree - convert = isinstance(text, unicode) - for bom in BOMS[encoding]: - bom = convert and bom.decode(encoding) or bom - if text.startswith(bom): - return text.lstrip(bom) - return text - - -# The following constant specifies the name used in the usage -# statement displayed for python versions lower than 2.3. (With -# python2.3 and higher the usage statement is generated by optparse -# and uses the actual name of the executable called.) - -EXECUTABLE_NAME_FOR_USAGE = "python markdown.py" - - -# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ---------- - - -# placeholders -STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder -ETX = u'\u0003' # Use ETX ("End of text") for end-of-placeholder -HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:" -HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX -INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:" -INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX - -AMP_SUBSTITUTE = STX+"amp"+ETX - -BLOCK_LEVEL_ELEMENTS = re.compile('p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del|hr|hr/|style|li|tr') - def isBlockLevel(tag): - """ - Used by HTMLBlockPreprocessor to check if a given tag is a block level - element. - """ + """Check if the tag is a block level HTML tag.""" return BLOCK_LEVEL_ELEMENTS.match(tag) - -def codepoint2name(code): - """ - Return entity definition by code, or code - if there is no such entity definition - """ - entity = htmlentitydefs.codepoint2name.get(code) - if entity: - return "%s%s;" % (AMP_SUBSTITUTE, entity) - else: - return "%s#%d;" % (AMP_SUBSTITUTE, code) - def handleAttributes(text, parent): - """ Handale attributes, e.g {@id=123} """ + """Set values of an element based on attribute definitions ({@id=123}).""" def attributeCallback(match): parent.set(match.group(1), match.group(2)) + return CORE_RE['attr'].sub(attributeCallback, text) - return RE.regExp['attr'].sub(attributeCallback, text) - - -class AtomicString(unicode): - "A string which should not be further processed." - pass +def dequote(string): + """Remove quotes from around a string.""" + if ( ( string.startswith('"') and string.endswith('"')) + or (string.startswith("'") and string.endswith("'")) ): + return string[1:-1] + else: + return string """ -====================================================================== -========================== PRE-PROCESSORS ============================ -====================================================================== - -Preprocessors munge source text before we start doing anything too -complicated. - -There are two types of preprocessors: TextPreprocessor and Preprocessor. +PRE-PROCESSORS +============================================================================= +Preprocessors work on source text before we start doing anything too +complicated. There are two types of preprocessors: TextPreprocessor and +Preprocessor. """ - class TextPreprocessor: """ TextPreprocessors are run before the text is broken into lines. @@ -255,27 +213,22 @@ class Preprocessor: class HtmlBlockPreprocessor(TextPreprocessor): - """ - Remove html blocks from the source text and store them for later retrieval. - """ + """Remove html blocks from the text and store them for later retrieval.""" + right_tag_patterns = ["", "%s>"] def _get_left_tag(self, block): return block[1:].replace(">", " ", 1).split()[0].lower() - - def _get_right_tag(self, left_tag, block): - + def _get_right_tag(self, left_tag, block): for p in self.right_tag_patterns: tag = p % left_tag i = block.rfind(tag) if i > 2: return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) - return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) def _equal_tags(self, left_tag, right_tag): - if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. return True if ("/" + left_tag) == right_tag: @@ -291,17 +244,14 @@ class HtmlBlockPreprocessor(TextPreprocessor): def _is_oneliner(self, tag): return (tag in ['hr', 'hr/']) - def run(self, text): - """ Find and remove raw html from text. """ new_blocks = [] - text = text.split("\n\n") - + text = text.split("\n\n") items = [] left_tag = '' right_tag = '' in_tag = False # flag - + while text: block = text[0] if block.startswith("\n"): @@ -312,9 +262,7 @@ class HtmlBlockPreprocessor(TextPreprocessor): block = block[1:] if not in_tag: - if block.startswith("<"): - left_tag = self._get_left_tag(block) right_tag, data_index = self._get_right_tag(left_tag, block) @@ -380,14 +328,13 @@ HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor() class HeaderPreprocessor(Preprocessor): - """ - Replace underlined headers with hashed headers to avoid - the need for lookahead later. + """Replace underlined headers with hashed headers. + + (To avoid the need for lookahead later.) + """ def run (self, lines): - """ Find and replace underlined headers. """ - i = -1 while i+1 < len(lines): i = i+1 @@ -416,14 +363,10 @@ HEADER_PREPROCESSOR = HeaderPreprocessor() class LinePreprocessor(Preprocessor): - """ - Convert HR lines to "___" format - """ + """Convert HR lines to "___" format.""" blockquote_re = re.compile(r'^(> )+') def run (self, lines): - """ Find and replace HR lines. """ - for i in range(len(lines)): prefix = '' m = self.blockquote_re.search(lines[i]) @@ -441,7 +384,7 @@ class LinePreprocessor(Preprocessor): if len(text) <= 2: return False for pattern in ['isline1', 'isline2', 'isline3']: - m = RE.regExp[pattern].match(text) + m = CORE_RE[pattern].match(text) if (m and m.group(1)): return True else: @@ -451,16 +394,11 @@ LINE_PREPROCESSOR = LinePreprocessor() class ReferencePreprocessor(Preprocessor): - """ - Remove reference definitions from the text and store them for later use. - - """ - + """Remove reference definitions from the text and store them for later use.""" def run (self, lines): - """ Remove and store reference defs. """ new_text = []; for line in lines: - m = RE.regExp['reference-def'].match(line) + m = CORE_RE['reference-def'].match(line) if m: id = m.group(2).strip().lower() t = m.group(4).strip() # potential title @@ -481,20 +419,21 @@ class ReferencePreprocessor(Preprocessor): REFERENCE_PREPROCESSOR = ReferencePreprocessor() + + """ -====================================================================== -========================== INLINE PATTERNS =========================== -====================================================================== +INLINE PATTERNS +============================================================================= Inline patterns such as *emphasis* are handled by means of auxiliary objects, one per pattern. Pattern objects must be instances of classes that extend markdown.Pattern. Each pattern object uses a single regular expression and needs support the following methods: - pattern.getCompiledRegExp() - returns a regular expression + pattern.getCompiledRegExp() # returns a regular expression - pattern.handleMatch(m) - takes a match object and returns - a ElementTree element or just plain text + pattern.handleMatch(m) # takes a match object and returns + # an ElementTree element or just plain text All of python markdown's built-in patterns subclass from Pattern, but you can add additional patterns that don't. @@ -509,20 +448,26 @@ important - e.g. if we first replace http://.../ links with tags and _then_ try to replace inline html, we would end up with a mess. So, we apply the expressions in the following order: - * escape and backticks have to go before everything else, so - that we can preempt any markdown patterns by escaping them. +* escape and backticks have to go before everything else, so + that we can preempt any markdown patterns by escaping them. + +* then we handle auto-links (must be done before inline html) + +* then we handle inline HTML. At this point we will simply + replace all inline HTML strings with a placeholder and add + the actual HTML to a hash. - * then we handle auto-links (must be done before inline html) +* then inline images (must be done before links) - * then we handle inline HTML. At this point we will simply - replace all inline HTML strings with a placeholder and add - the actual HTML to a hash. +* then bracketed links, first regular then reference-style - * then inline images (must be done before links) +* finally we apply strong and emphasis +""" - * then bracketed links, first regular then reference-style - * finally we apply strong and emphasis +""" +The actual regular expressions for patterns +----------------------------------------------------------------------------- """ NOBRACKET = r'[^\]\[]*' @@ -558,6 +503,12 @@ ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # & LINE_BREAK_RE = r' \n' # two spaces at end of line LINE_BREAK_2_RE = r' $' # two spaces at end of text + +""" +The pattern classes +----------------------------------------------------------------------------- +""" + class Pattern: """Base class that inline patterns subclass. """ @@ -581,9 +532,9 @@ class Pattern: return self.compiled_re def handleMatch(self, m): - """ - Return a ElementTree element from the given match. Subclasses should - override this method. + """Return a ElementTree element from the given match. + + Subclasses should override this method. Keyword arguments: @@ -639,8 +590,8 @@ class BacktickPattern (Pattern): class DoubleTagPattern (SimpleTagPattern): - """ - Return a ElementTree element nested in tag2 nested in tag1. + """Return a ElementTree element nested in tag2 nested in tag1. + Useful for strong emphasis etc. """ @@ -665,7 +616,6 @@ class HtmlPattern (Pattern): class LinkPattern (Pattern): """ Return a link element from the given match. """ def handleMatch(self, m): - el = etree.Element("a") el.text = m.group(2) title = m.group(11) @@ -796,6 +746,14 @@ class AutomailPattern (Pattern): if email.startswith("mailto:"): email = email[len("mailto:"):] + def codepoint2name(code): + """Return entity definition by code, or the code if not defined.""" + entity = htmlentitydefs.codepoint2name.get(code) + if entity: + return "%s%s;" % (AMP_SUBSTITUTE, entity) + else: + return "%s#%d;" % (AMP_SUBSTITUTE, code) + letters = [codepoint2name(ord(letter)) for letter in email] el.text = AtomicString(''.join(letters)) @@ -831,18 +789,16 @@ AUTOMAIL_PATTERN = AutomailPattern(AUTOMAIL_RE) """ -====================================================================== -========================== POST-PROCESSORS =========================== -====================================================================== +POST-PROCESSORS +============================================================================= -Markdown also allows post-processors, which are similar to -preprocessors in that they need to implement a "run" method. However, -they are run after core processing. +Markdown also allows post-processors, which are similar to preprocessors in +that they need to implement a "run" method. However, they are run after core +processing. There are two types of post-processors: Postprocessor and TextPostprocessor """ - class Postprocessor: """ Postprocessors are run before the ElementTree serialization. @@ -863,7 +819,6 @@ class Postprocessor: pass - class TextPostprocessor: """ TextPostprocessors are run after the ElementTree it converted back into text. @@ -884,12 +839,11 @@ class TextPostprocessor: """ pass -class PrettifyPostprocessor(Postprocessor): - """ Add linebreaks to the html document. """ +class PrettifyPostprocessor(Postprocessor): + """Add linebreaks to the html document.""" def _prettifyETree(self, elem): - """ Recursively add linebreaks to ElementTree children. """ - + """Recursively add linebreaks to ElementTree children.""" i = "\n" if isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']: if (not elem.text or not elem.text.strip()) \ @@ -904,8 +858,7 @@ class PrettifyPostprocessor(Postprocessor): elem.tail = i def run(self, root): - """ Add linebreaks to ElementTree root object """ - + """.Add linebreaks to ElementTree root object.""" self._prettifyETree(root) # Do
's seperately as they are often in the middle of # inline content and missed by _prettifyETree. @@ -965,11 +918,15 @@ AMPSUBSTITUTETEXTPOSTPROCESSOR = AndSubstitutePostprocessor() """ -====================================================================== -========================== MISC AUXILIARY CLASSES ==================== -====================================================================== +MISC AUXILIARY CLASSES +============================================================================= """ +class AtomicString(unicode): + """A string which should not be further processed.""" + pass + + class HtmlStash: """ This class is used for stashing HTML objects that we extract @@ -1004,98 +961,6 @@ class HtmlStash: self.html_counter = 0 self.rawHtmlBlocks = [] - -class BlockGuru: - """ Parse document for block level constructs (paragraphs, lists, etc.).""" - - def _findHead(self, lines, fn, allowBlank=0): - - """ - Functional magic to help determine boundaries of indented - blocks. - - Keyword arguments: - - * lines: an array of strings - * fn: a function that returns a substring of a string - if the string matches the necessary criteria - * allowBlank: specifies whether it's ok to have blank - lines between matching functions - - Returns: a list of post processes items and the unused - remainder of the original list - - """ - items = [] - item = -1 - - i = 0 # to keep track of where we are - - for line in lines: - - if not line.strip() and not allowBlank: - return items, lines[i:] - - if not line.strip() and allowBlank: - # If we see a blank line, this _might_ be the end - i += 1 - - # Find the next non-blank line - for j in range(i, len(lines)): - if lines[j].strip(): - next = lines[j] - break - else: - # There is no more text => this is the end - break - - # Check if the next non-blank line is still a part of the list - - part = fn(next) - - if part: - items.append("") - continue - else: - break # found end of the list - - part = fn(line) - - if part: - items.append(part) - i += 1 - continue - else: - return items, lines[i:] - else: - i += 1 - - return items, lines[i:] - - - def detabbed_fn(self, line): - """ An auxiliary method to be passed to _findHead """ - m = RE.regExp['tabbed'].match(line) - if m: - return m.group(4) - else: - return None - - - def detectTabbed(self, lines): - """ Find indented text and remove indent before further proccesing. """ - return self._findHead(lines, self.detabbed_fn, - allowBlank = 1) - - -def dequote(string): - """ Removes quotes from around a string """ - if ( ( string.startswith('"') and string.endswith('"')) - or (string.startswith("'") and string.endswith("'")) ): - return string[1:-1] - else: - return string - class InlineStash: @@ -1150,52 +1015,34 @@ class InlineStash: self._nodes = {} """ -====================================================================== -========================== CORE MARKDOWN ============================= -====================================================================== +CORE MARKDOWN +============================================================================= -This stuff is hard, so if you are thinking of extending the syntax, -see first if you can do it via pre-processors, post-processors, -inline patterns or a combination of the three. +The core part is still quite messy, despite substantial refactoring. If you +are thinking of extending the syntax, see first if you can do it through +pre-processors, post-processors, inline patterns or a combination of the three. """ -class CorePatterns: - """ - This class is scheduled for removal as part of a refactoring effort. - """ - - patterns = { - 'header': r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)', # # A title - 'reference-def': r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)', - # [Google]: http://www.google.com/ - 'containsline': r'([-]*)$|^([=]*)', # -----, =====, etc. - 'ol': r'[ ]{0,3}[\d]*\.\s+(.*)', # 1. text - 'ul': r'[ ]{0,3}[*+-]\s+(.*)', # "* text" - 'isline1': r'(\**)', # *** - 'isline2': r'(\-*)', # --- - 'isline3': r'(\_*)', # ___ - 'tabbed': r'((\t)|( ))(.*)', # an indented line - 'quoted': r'[ ]{0,2}> ?(.*)', # a quoted block ("> ...") - } - - def __init__ (self): - - self.regExp = {} - for key in self.patterns.keys(): - self.regExp[key] = re.compile("^%s$" % self.patterns[key], - re.DOTALL) - - self.regExp['containsline'] = re.compile(r'^([-]*)$|^([=]*)$', re.M) - self.regExp['attr'] = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} - -RE = CorePatterns() +def _wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL) +CORE_RE = { + 'header': _wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title + 'reference-def': _wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'), + # [Google]: http://www.google.com/ + 'containsline': _wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc. + 'ol': _wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text + 'ul': _wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text" + 'isline1': _wrapRe(r'(\**)'), # *** + 'isline2': _wrapRe(r'(\-*)'), # --- + 'isline3': _wrapRe(r'(\_*)'), # ___ + 'tabbed': _wrapRe(r'((\t)|( ))(.*)'), # an indented line + 'quoted': _wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...") + 'containsline': re.compile(r'^([-]*)$|^([=]*)$', re.M), + 'attr': re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} +} class Markdown: - """ - Markdown formatter class for creating an html document from Markdown text. - """ - + """Converts markdown to HTML.""" def __init__(self, extensions=[], @@ -1216,7 +1063,6 @@ class Markdown: """ self.source = None self.safeMode = safe_mode - self.blockGuru = BlockGuru() self.registeredExtensions = [] self.docType = "" self.stripTopLevelTags = True @@ -1397,7 +1243,7 @@ class Markdown: 'tabbed': self._processCodeBlock} for regexp in ['ul', 'ol', 'quoted', 'tabbed']: - m = RE.regExp[regexp].match(lines[0]) + m = CORE_RE[regexp].match(lines[0]) if m: processFn[regexp](parent_elem, lines, inList) return @@ -1420,8 +1266,8 @@ class Markdown: if inList: start, lines = self._linesUntil(lines, (lambda line: - RE.regExp['ul'].match(line) - or RE.regExp['ol'].match(line) + CORE_RE['ul'].match(line) + or CORE_RE['ol'].match(line) or not line.strip())) self._processSection(parent_elem, start, @@ -1437,7 +1283,7 @@ class Markdown: self._processHeader(parent_elem, paragraph) elif len(paragraph) and \ - RE.regExp["isline3"].match(paragraph[0]): + CORE_RE["isline3"].match(paragraph[0]): self._processHR(parent_elem) lines = paragraph[1:] + lines @@ -1453,7 +1299,7 @@ class Markdown: hr = etree.SubElement(parentElem, "hr") def _processHeader(self, parentElem, paragraph): - m = RE.regExp['header'].match(paragraph[0]) + m = CORE_RE['header'].match(paragraph[0]) if m: level = len(m.group(1)) h = etree.SubElement(parentElem, "h%d" % level) @@ -1480,7 +1326,7 @@ class Markdown: # Searching for hr or header for line in paragraph: # it's hr - if RE.regExp["isline3"].match(line): + if CORE_RE["isline3"].match(line): el.text = "\n".join(dump) self._processHR(el) dump = [] @@ -1548,8 +1394,8 @@ class Markdown: # Check if the next non-blank line is still a part of the list - if ( RE.regExp[listexpr].match(next) or - RE.regExp['tabbed'].match(next) ): + if ( CORE_RE[listexpr].match(next) or + CORE_RE['tabbed'].match(next) ): # get rid of any white space in the line items[item].append(line.strip()) looseList = loose or looseList @@ -1562,7 +1408,7 @@ class Markdown: for expr in ['ul', 'ol', 'tabbed']: - m = RE.regExp[expr].match(line) + m = CORE_RE[expr].match(line) if m: if expr in ['ul', 'ol']: # We are looking at a new item #if m.group(1) : @@ -1628,7 +1474,7 @@ class Markdown: i = 0 blank_line = False # allow one blank line between paragraphs for line in lines: - m = RE.regExp['quoted'].match(line) + m = CORE_RE['quoted'].match(line) if m: dequoted.append(m.group(1)) i += 1 @@ -1667,7 +1513,7 @@ class Markdown: Returns: None """ - detabbed, theRest = self.blockGuru.detectTabbed(lines) + detabbed, theRest = self.detectTabbed(lines) pre = etree.SubElement(parentElem, "pre") code = etree.SubElement(pre, "code") @@ -1675,6 +1521,59 @@ class Markdown: text = "\n".join(detabbed).rstrip()+"\n" code.text = AtomicString(text) self._processSection(parentElem, theRest, inList) + + def detectTabbed(self, lines): + """ Find indented text and remove indent before further proccesing. + + Keyword arguments: + + * lines: an array of strings + * fn: a function that returns a substring of a string + if the string matches the necessary criteria + + Returns: a list of post processes items and the unused + remainder of the original list + + """ + items = [] + item = -1 + i = 0 # to keep track of where we are + + def detab(line): + match = CORE_RE['tabbed'].match(line) + if match: + return match.group(4) + + for line in lines: + if line.strip(): # Non-blank line + line = detab(line) + if line: + items.append(line) + i += 1 + continue + else: + return items, lines[i:] + + else: # Blank line: _maybe_ we are done. + i += 1 # advance + + # Find the next non-blank line + for j in range(i, len(lines)): + if lines[j].strip(): + next_line = lines[j]; break + else: + break # There is no more text; we are done. + + # Check if the next non-blank line is tabbed + if detab(next_line): # Yes, more work to do. + items.append("") + continue + else: + break # No, we are done. + else: + i += 1 + + return items, lines[i:] def _handleInline(self, data, patternIndex=0): """ @@ -1730,11 +1629,11 @@ class Markdown: if node is None: return data, True, len(leftData) + match.span(len(match.groups()))[0] - if not isstr(node): + if not isString(node): if not isinstance(node.text, AtomicString): # We need to process current node too for child in [node] + node.getchildren(): - if not isstr(node): + if not isString(node): if child.text: child.text = self._handleInline(child.text, patternIndex + 1) @@ -1824,14 +1723,14 @@ class Markdown: text = data[strartIndex:index] linkText(text) - if not isstr(node): # it's Element + if not isString(node): # it's Element for child in [node] + node.getchildren(): if child.tail: if child.tail.strip(): self._processElementText(node, child, False) - + if child.text: if child.text.strip(): self._processElementText(child, child) @@ -1872,35 +1771,26 @@ class Markdown: Returns: ElementTree object with applied inline patterns. """ - el = markdownTree.getroot() - - stack = [el] + stack = [markdownTree.getroot()] while stack: currElement = stack.pop() insertQueue = [] for child in currElement.getchildren(): - - if not isinstance(child.text, AtomicString) and child.text: - + if child.text and not isinstance(child.text, AtomicString): text = child.text child.text = None lst = self._processPlaceholders(self._handleInline( text), child) stack += lst - - insertQueue.append((child, lst)) - if child.getchildren(): stack.append(child) - for element, lst in insertQueue: if element.text: - element.text = handleAttributes(element.text, - element) + element.text = handleAttributes(element.text, element) i = 0 for newChild in lst: # Processing attributes @@ -1913,13 +1803,10 @@ class Markdown: element.insert(i, newChild) i += 1 - return markdownTree - def markdownToTree(self, source=None): - """ - Create ElementTree, without applying inline paterns. + """Create ElementTree, without applying inline paterns. Keyword arguments: @@ -1934,10 +1821,8 @@ class Markdown: return u"" # Fixup the source text - self.source = self.source.replace(STX, "") self.source = self.source.replace(ETX, "") - self.source = self.source.replace("\r\n", "\n").replace("\r", "\n") self.source += "\n\n" self.source = self.source.expandtabs(TAB_LENGTH) @@ -1950,8 +1835,7 @@ class Markdown: return markdownTree def convert (self, source): - """ - Create the document in XHTML format. + """Create the document in XHTML format. Keyword arguments: @@ -1986,10 +1870,8 @@ class Markdown: return xml.strip() - def __str__(self): - ''' Report info about instance. Markdown always returns unicode. ''' - + """ Report info about instance. Markdown always returns unicode.""" if self.source is None: status = 'in which no source text has been assinged.' else: @@ -2000,26 +1882,29 @@ class Markdown: __unicode__ = convert # markdown should always return a unicode string +""" +EXPORTED FUNCTIONS +============================================================================= - - -# ==================================================================== +Those are the two functions we really mean to export: markdown() and +markdownFromFile(). +""" def markdownFromFile(input = None, output = None, extensions = [], encoding = None, safe = False): - """ - Convenience wrapper function that takes a filename as input. + """Converts a markdown file and returns the HTML as a unicode string. Used from the command-line, although may be useful in other situations. Decodes the file using the provided encoding (defaults to utf-8), passes the file content to markdown, and outputs the html to either the provided filename or stdout in the same encoding as the source file. - **Note:** This is the only place that decoding and encoding takes place - in Python-Markdown. + **Note:** This is the only place that decoding and encoding of unicode + takes place in Python-Markdown. (All other code is unicode-in / + unicode-out.) Keyword arguments: @@ -2029,30 +1914,26 @@ def markdownFromFile(input = None, * encoding: Encoding of input and output files. Defaults to utf-8. * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". - Returns: An HTML document as a string. - """ + + encoding = encoding or "utf-8" - message(DEBUG, "input file: %s" % input) - - if not encoding: - encoding = "utf-8" - + # Read the source input_file = codecs.open(input, mode="r", encoding=encoding) text = input_file.read() input_file.close() + text = text.lstrip(u'\ufeff') # remove the byte-order mark - text = removeBOM(text, encoding) - - new_text = markdown(text, extensions, safe_mode = safe) + # Convert + html = markdown(text, extensions, safe_mode = safe) + # Write to file or stdout if output: output_file = codecs.open(output, "w", encoding=encoding) - output_file.write(new_text) + output_file.write(html) output_file.close() - else: - sys.stdout.write(new_text.encode(encoding)) + sys.stdout.write(html.encode(encoding)) def markdown(text, extensions = [], @@ -2082,11 +1963,15 @@ def markdown(text, return md.convert(text) +""" +Extensions +----------------------------------------------------------------------------- +""" + class Extension: """ Base class for extensions to subclass. """ def __init__(self, configs = {}): - """ - Create an instance of an Extention. + """Create an instance of an Extention. Keyword arguments: @@ -2169,9 +2054,19 @@ def load_extension(ext_name, configs = []): return module.makeExtension(configs.items()) -############################################################################# -## Only command-line specific stuff from here down. -############################################################################# +# Extensions should use "markdown.etree" instead of "etree" (or do `from +# markdown import etree`). Do not import it by yourself. + +etree = importETree() + + +""" +COMMAND-LINE SPECIFIC STUFF +============================================================================= + +The rest of the code is specifically for handling the case where Python +Markdown is called from the command line. +""" OPTPARSE_WARNING = """ Python 2.3 or higher required for advanced command line options. diff --git a/markdown_extensions/codehilite.py b/markdown_extensions/codehilite.py index a96aaaa..7f4a1a8 100644 --- a/markdown_extensions/codehilite.py +++ b/markdown_extensions/codehilite.py @@ -208,7 +208,7 @@ class CodeHiliteExtention(markdown.Extension): """ - detabbed, theRest = md.blockGuru.detectTabbed(lines) + detabbed, theRest = md.detectTabbed(lines) text = "\n".join(detabbed).rstrip()+"\n" code = CodeHilite(text, linenos=self.config['force_linenos'][0], css_class=self.config['css_class'][0]) -- cgit v1.2.3