From 8b6b7b0a39321dadfcab4d0a16053377c4715bee Mon Sep 17 00:00:00 2001 From: Yuri Takhteyev Date: Sun, 12 Oct 2008 19:37:20 -0700 Subject: Refactored markdown tree traversing logic into a separate class (InlineProcessor). --- markdown.py | 391 +++++++++++++++++++++++++++++------------------------------- 1 file changed, 189 insertions(+), 202 deletions(-) (limited to 'markdown.py') diff --git a/markdown.py b/markdown.py index 0aa530d..ae8dc10 100755 --- a/markdown.py +++ b/markdown.py @@ -1401,28 +1401,32 @@ class HtmlStash: self.html_counter += 1 return placeholder - def rest(self): + def reset(self): self.html_counter = 0 self.rawHtmlBlocks = [] - -class InlineStash: - - def __init__(self): - """ Create a InlineStash. """ - self.prefix = INLINE_PLACEHOLDER_PREFIX - self.suffix = ETX - self._nodes = {} - self.phLength = 4 + len(self.prefix) + len(self.suffix) - self._placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})') + +class InlineProcessor: + """ + An auxiliary class to traverse a Markdown tree, applying inline patterns. + """ + + def __init__ (self, patterns): + self.inlinePatterns = patterns + + self.__placeholder_prefix = INLINE_PLACEHOLDER_PREFIX + self.__placeholder_suffix = ETX + self.__placeholder_length = 4 + len(self.__placeholder_prefix) \ + + len(self.__placeholder_suffix) + self.__placeholder_re = re.compile(INLINE_PLACEHOLDER % r'([0-9]{4})') - def _genPlaceholder(self, type): + def __makePlaceholder(self, type): """ Generate a placeholder """ - id = "%04d" % len(self._nodes) + id = "%04d" % len(self.stashed_nodes) hash = INLINE_PLACEHOLDER % id return hash, id - def extractId(self, data, index): + def __findPlaceholder(self, data, index): """ Extract id from data string, start from index @@ -1434,164 +1438,19 @@ class InlineStash: Returns: placeholder id and string index, after found placeholder """ - m = self._placeholder_re.search(data, index) + m = self.__placeholder_re.search(data, index) if m: return m.group(1), m.end() else: return None, index + 1 - def isin(self, id): - """ Check if node with given id exists in stash """ - return self._nodes.has_key(id) - - def get(self, id): - """ Return node by id """ - return self._nodes.get(id) - - def add(self, node, type): + def __stashNode(self, node, type): """ Add node to stash """ - pholder, id = self._genPlaceholder(type) - self._nodes[id] = node - return pholder - - def rest(self): - """ Reset instance """ - self._nodes = {} + placeholder, id = self.__makePlaceholder(type) + self.stashed_nodes[id] = node + return placeholder - - -class Markdown: - """Convert Markdown to HTML.""" - - def __init__(self, - extensions=[], - extension_configs={}, - safe_mode = False): - """ - Creates a new Markdown instance. - - Keyword arguments: - - * extensions: A list of extensions. - If they are of type string, the module mdx_name.py will be loaded. - If they are a subclass of markdown.Extension, they will be used - as-is. - * extension-configs: Configuration setting for extensions. - * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". - - """ - self.parser = MarkdownParser() - self.safeMode = safe_mode - self.registeredExtensions = [] - self.docType = "" - self.stripTopLevelTags = True - - self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR] - - self.preprocessors = [HEADER_PREPROCESSOR, - LINE_PREPROCESSOR, - # A footnote preprocessor will - # get inserted here - REFERENCE_PREPROCESSOR] - - - self.postprocessors = [PRETTIFYPOSTPROCESSOR, - # a footnote postprocessor will get - # inserted later - ] - - self.textPostprocessors = [# a footnote postprocessor will get - # inserted here - RAWHTMLTEXTPOSTPROCESSOR, - AMPSUBSTITUTETEXTPOSTPROCESSOR] - - self.prePatterns = [] - - self.inlinePatterns = [ - BACKTICK_PATTERN, - ESCAPE_PATTERN, - REFERENCE_PATTERN, - LINK_PATTERN, - IMAGE_LINK_PATTERN, - IMAGE_REFERENCE_PATTERN, - AUTOLINK_PATTERN, - AUTOMAIL_PATTERN, - LINE_BREAK_PATTERN_2, - LINE_BREAK_PATTERN, - HTML_PATTERN, - ENTITY_PATTERN, - NOT_STRONG_PATTERN, - STRONG_EM_PATTERN, - STRONG_PATTERN, - EMPHASIS_PATTERN, - EMPHASIS_PATTERN_2 - # The order of the handlers matters!!! - ] - - self.inlineStash = InlineStash() - self.references = {} - self.htmlStash = HtmlStash() - - - self.registerExtensions(extensions = extensions, - configs = extension_configs) - - self.reset() - - - def registerExtensions(self, extensions, configs): - """ - Register extensions with this instance of Markdown. - - Keyword aurguments: - - * extensions: A list of extensions, which can either - be strings or objects. See the docstring on Markdown. - * configs: A dictionary mapping module names to config options. - - """ - for ext in extensions: - if isinstance(ext, basestring): - ext = load_extension(ext, configs.get(ext, [])) - elif hasattr(ext, 'extendMarkdown'): - # Looks like an Extension. - # Nothing to do here. - pass - else: - message(ERROR, "Incorrect type! Extension '%s' is " - "neither a string or an Extension." %(repr(ext))) - continue - ext.extendMarkdown(self, globals()) - - def registerExtension(self, extension): - """ This gets called by the extension """ - self.registeredExtensions.append(extension) - - def reset(self): - """ - Resets all state variables so that we can start with a new text. - """ - self.inlineStash.rest() - self.htmlStash.rest() - self.references.clear() - - HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash - LINE_PREPROCESSOR.stash = self.htmlStash - REFERENCE_PREPROCESSOR.references = self.references - HTML_PATTERN.stash = self.htmlStash - ENTITY_PATTERN.stash = self.htmlStash - REFERENCE_PATTERN.references = self.references - IMAGE_REFERENCE_PATTERN.references = self.references - RAWHTMLTEXTPOSTPROCESSOR.stash = self.htmlStash - RAWHTMLTEXTPOSTPROCESSOR.safeMode = self.safeMode - - for extension in self.registeredExtensions: - extension.reset() - - for pattern in self.inlinePatterns: - pattern.safe_mode = self.safeMode - - def _handleInline(self, data, patternIndex=0): + def __handleInline(self, data, patternIndex=0): """ Process string with inline patterns and replace it with placeholders @@ -1604,23 +1463,20 @@ class Markdown: Returns: String with placeholders. """ - if isinstance(data, AtomicString): - return data - - startIndex = 0 - while patternIndex < len(self.inlinePatterns): - data, matched, startIndex = self._applyInline( - self.inlinePatterns[patternIndex], - data, patternIndex, startIndex) - if not matched: - patternIndex += 1 + if not isinstance(data, AtomicString): + startIndex = 0 + while patternIndex < len(self.inlinePatterns): + data, matched, startIndex = self.__applyPattern( + self.inlinePatterns[patternIndex], + data, patternIndex, startIndex) + if not matched: + patternIndex += 1 return data - - def _processElementText(self, node, subnode, isText=True): + def __processElementText(self, node, subnode, isText=True): """ Process placeholders in Element.text or Element.tail - of Elements popped from InlineStash + of Elements popped from self.stashed_nodes. Keywords arguments: @@ -1638,7 +1494,7 @@ class Markdown: text = subnode.tail subnode.tail = None - childResult = self._processPlaceholders(text, subnode) + childResult = self.__processPlaceholders(text, subnode) if not isText and node is not subnode: pos = node.getchildren().index(subnode) @@ -1650,7 +1506,7 @@ class Markdown: for newChild in childResult: node.insert(pos, newChild) - def _processPlaceholders(self, data, parent): + def __processPlaceholders(self, data, parent): """ Process string with placeholders and generate ElementTree tree. @@ -1675,16 +1531,14 @@ class Markdown: parent.text = text result = [] - prefix = self.inlineStash.prefix - strartIndex = 0 - + strartIndex = 0 while data: - index = data.find(prefix, strartIndex) + index = data.find(self.__placeholder_prefix, strartIndex) if index != -1: - id, phEndIndex = self.inlineStash.extractId(data, index) + id, phEndIndex = self.__findPlaceholder(data, index) - if self.inlineStash.isin(id): - node = self.inlineStash.get(id) + if self.stashed_nodes.has_key(id): + node = self.stashed_nodes.get(id) if index > 0: text = data[strartIndex:index] @@ -1694,10 +1548,10 @@ class Markdown: for child in [node] + node.getchildren(): if child.tail: if child.tail.strip(): - self._processElementText(node, child, False) + self.__processElementText(node, child, False) if child.text: if child.text.strip(): - self._processElementText(child, child) + self.__processElementText(child, child) else: # it's just a string linkText(node) strartIndex = phEndIndex @@ -1718,10 +1572,10 @@ class Markdown: return result - def _applyInline(self, pattern, data, patternIndex, startIndex=0): + def __applyPattern(self, pattern, data, patternIndex, startIndex=0): """ Check if the line fits the pattern, create the necessary - elements, add it to InlineStash + elements, add it to stashed_nodes. Keyword arguments: @@ -1749,17 +1603,17 @@ class Markdown: for child in [node] + node.getchildren(): if not isString(node): if child.text: - child.text = self._handleInline(child.text, + child.text = self.__handleInline(child.text, patternIndex + 1) if child.tail: - child.tail = self._handleInline(child.tail, + child.tail = self.__handleInline(child.tail, patternIndex) - pholder = self.inlineStash.add(node, pattern.type()) + placeholder = self.__stashNode(node, pattern.type()) return "%s%s%s%s" % (leftData, match.group(1), - pholder, match.groups()[-1]), True, 0 + placeholder, match.groups()[-1]), True, 0 def applyInlinePatterns(self, markdownTree): @@ -1777,6 +1631,8 @@ class Markdown: Returns: ElementTree object with applied inline patterns. """ + self.stashed_nodes = {} + stack = [markdownTree.getroot()] while stack: @@ -1786,7 +1642,7 @@ class Markdown: if child.text and not isinstance(child.text, AtomicString): text = child.text child.text = None - lst = self._processPlaceholders(self._handleInline( + lst = self.__processPlaceholders(self.__handleInline( text), child) stack += lst insertQueue.append((child, lst)) @@ -1811,6 +1667,139 @@ class Markdown: return markdownTree + + + +class Markdown: + """Convert Markdown to HTML.""" + + def __init__(self, + extensions=[], + extension_configs={}, + safe_mode = False): + """ + Creates a new Markdown instance. + + Keyword arguments: + + * extensions: A list of extensions. + If they are of type string, the module mdx_name.py will be loaded. + If they are a subclass of markdown.Extension, they will be used + as-is. + * extension-configs: Configuration setting for extensions. + * safe_mode: Disallow raw html. One of "remove", "replace" or "escape". + + """ + self.parser = MarkdownParser() + self.safeMode = safe_mode + self.registeredExtensions = [] + self.docType = "" + self.stripTopLevelTags = True + + self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR] + + self.preprocessors = [HEADER_PREPROCESSOR, + LINE_PREPROCESSOR, + # A footnote preprocessor will + # get inserted here + REFERENCE_PREPROCESSOR] + + + self.postprocessors = [PRETTIFYPOSTPROCESSOR, + # a footnote postprocessor will get + # inserted later + ] + + self.textPostprocessors = [# a footnote postprocessor will get + # inserted here + RAWHTMLTEXTPOSTPROCESSOR, + AMPSUBSTITUTETEXTPOSTPROCESSOR] + + self.prePatterns = [] + + self.inlinePatterns = [ + BACKTICK_PATTERN, + ESCAPE_PATTERN, + REFERENCE_PATTERN, + LINK_PATTERN, + IMAGE_LINK_PATTERN, + IMAGE_REFERENCE_PATTERN, + AUTOLINK_PATTERN, + AUTOMAIL_PATTERN, + LINE_BREAK_PATTERN_2, + LINE_BREAK_PATTERN, + HTML_PATTERN, + ENTITY_PATTERN, + NOT_STRONG_PATTERN, + STRONG_EM_PATTERN, + STRONG_PATTERN, + EMPHASIS_PATTERN, + EMPHASIS_PATTERN_2 + # The order of the handlers matters!!! + ] + + self.inlineProcessor = InlineProcessor(self.inlinePatterns) + self.references = {} + self.htmlStash = HtmlStash() + + + self.registerExtensions(extensions = extensions, + configs = extension_configs) + + self.reset() + + + def registerExtensions(self, extensions, configs): + """ + Register extensions with this instance of Markdown. + + Keyword aurguments: + + * extensions: A list of extensions, which can either + be strings or objects. See the docstring on Markdown. + * configs: A dictionary mapping module names to config options. + + """ + for ext in extensions: + if isinstance(ext, basestring): + ext = load_extension(ext, configs.get(ext, [])) + elif hasattr(ext, 'extendMarkdown'): + # Looks like an Extension. + # Nothing to do here. + pass + else: + message(ERROR, "Incorrect type! Extension '%s' is " + "neither a string or an Extension." %(repr(ext))) + continue + ext.extendMarkdown(self, globals()) + + def registerExtension(self, extension): + """ This gets called by the extension """ + self.registeredExtensions.append(extension) + + def reset(self): + """ + Resets all state variables so that we can start with a new text. + """ + self.htmlStash.reset() + self.references.clear() + + HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash + LINE_PREPROCESSOR.stash = self.htmlStash + REFERENCE_PREPROCESSOR.references = self.references + HTML_PATTERN.stash = self.htmlStash + ENTITY_PATTERN.stash = self.htmlStash + REFERENCE_PATTERN.references = self.references + IMAGE_REFERENCE_PATTERN.references = self.references + RAWHTMLTEXTPOSTPROCESSOR.stash = self.htmlStash + RAWHTMLTEXTPOSTPROCESSOR.safeMode = self.safeMode + + for extension in self.registeredExtensions: + extension.reset() + + for pattern in self.inlinePatterns: + pattern.safe_mode = self.safeMode + def convert (self, source): """Convert markdown to serialized XHTML.""" @@ -1823,10 +1812,8 @@ class Markdown: message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.') return u"" - source = source.replace(STX, "") - source = source.replace(ETX, "") - source = source.replace("\r\n", "\n").replace("\r", "\n") - source += "\n\n" + source = source.replace(STX, "").replace(ETX, "") + source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n" source = source.expandtabs(TAB_LENGTH) # Run the text preprocessors @@ -1841,7 +1828,8 @@ class Markdown: # Parse the high-level elements. tree = self.parser.parseDocument(self.lines) - root = self.applyInlinePatterns(tree).getroot() + # Apply inline patterns + root = self.inlineProcessor.applyInlinePatterns(tree).getroot() # Run the post-processors for postprocessor in self.postprocessors: @@ -2042,7 +2030,6 @@ def markdownFromFile(input = None, encoding = None, safe = False): - md = Markdown(extensions=load_extensions(extensions), safe_mode = safe_mode) md.convertFile(input, output, encoding) -- cgit v1.2.3