From 517d38e552e91ebbe527a0286d43dd1daa585bcc Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Thu, 20 Nov 2008 19:38:09 -0500 Subject: Cleaned up recent refactor into a package from a single file. --- markdown/__init__.py | 163 ++++++++++++++++---------------- markdown/linepreprocessors.py | 214 ------------------------------------------ markdown/postprocessors.py | 18 +++- markdown/preprocessors.py | 214 ++++++++++++++++++++++++++++++++++++++++++ markdown/treeprocessors.py | 24 +++-- 5 files changed, 327 insertions(+), 306 deletions(-) mode change 100755 => 100644 markdown/__init__.py delete mode 100644 markdown/linepreprocessors.py create mode 100644 markdown/preprocessors.py diff --git a/markdown/__init__.py b/markdown/__init__.py old mode 100755 new mode 100644 index 27cb9ff..8eacd45 --- a/markdown/__init__.py +++ b/markdown/__init__.py @@ -64,8 +64,8 @@ ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz"> SMART_EMPHASIS = True # this_or_that does not become thisorthat HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul" - +"|script|noscript|form|fieldset|iframe|math" - +"|ins|del|hr|hr/|style|li|dt|dd|tr") + "|script|noscript|form|fieldset|iframe|math" + "|ins|del|hr|hr/|style|li|dt|dd|tr") # Placeholders STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder @@ -74,12 +74,18 @@ INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:" INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX AMP_SUBSTITUTE = STX+"amp"+ETX -import linepreprocessors, blockprocessors, treeprocessors, inlinepatterns +import preprocessors, blockprocessors, treeprocessors, inlinepatterns import postprocessors import blockparser import etree_loader import odict +# Extensions should use "markdown.etree" instead of "etree" (or do `from +# markdown import etree`). Do not import it by yourself. + +etree = etree_loader.importETree() + + """ Constants you probably do not need to change ----------------------------------------------------------------------------- @@ -108,6 +114,15 @@ def isBlockLevel(tag): """Check if the tag is a block level HTML tag.""" return BLOCK_LEVEL_ELEMENTS.match(tag) +""" +MISC AUXILIARY CLASSES +============================================================================= +""" + +class AtomicString(unicode): + """A string which should not be further processed.""" + pass + """ OVERALL DESIGN @@ -127,46 +142,9 @@ Markdown processing takes place in four steps: Those steps are put together by the Markdown() class. -The code below is organized as follows: - -1. BlockParser and it's BlockProcessors - does core block parsing. -2. All the preprocessors, patterns, treeprocessors, and postprocessors. -3. Markdown class - does the high-level wrapping. -""" - - - - - -""" -POST-PROCESSORS -============================================================================= - -Markdown also allows post-processors, which are similar to preprocessors in -that they need to implement a "run" method. However, they are run after core -processing. - -There are two types of post-processors: Treeprocessor and Postprocessor """ - - -""" -MISC AUXILIARY CLASSES -============================================================================= -""" - -class AtomicString(unicode): - """A string which should not be further processed.""" - pass - - -""" -Markdown -============================================================================= -""" - class Markdown: """Convert Markdown to HTML.""" @@ -195,66 +173,93 @@ class Markdown: # Preprocessors self.preprocessors = odict.OrderedDict() - self.preprocessors["html_block"] = linepreprocessors.HtmlBlockPreprocessor(self) - self.preprocessors["reference"] = linepreprocessors.ReferencePreprocessor(self) + self.preprocessors["html_block"] = \ + preprocessors.HtmlBlockPreprocessor(self) + self.preprocessors["reference"] = \ + preprocessors.ReferencePreprocessor(self) # footnote preprocessor will be inserted with "amp_substitute" self.references = {} - self.htmlStash = linepreprocessors.HtmlStash() + self.htmlStash = preprocessors.HtmlStash() self.registerExtensions(extensions = extensions, configs = extension_configs) self.reset() @@ -427,6 +432,7 @@ class Extension: """ pass + def load_extension(ext_name, configs = []): """Load extension by name, then return the module. @@ -466,6 +472,7 @@ def load_extension(ext_name, configs = []): except: message(CRITICAL, "Failed to instantiate extension '%s'" % ext_name) + def load_extensions(ext_names): """Loads multiple extensions""" extensions = [] @@ -475,10 +482,6 @@ def load_extensions(ext_names): extensions.append(extension) return extensions -# Extensions should use "markdown.etree" instead of "etree" (or do `from -# markdown import etree`). Do not import it by yourself. - -etree = etree_loader.importETree() """ EXPORTED FUNCTIONS diff --git a/markdown/linepreprocessors.py b/markdown/linepreprocessors.py deleted file mode 100644 index 712a1e8..0000000 --- a/markdown/linepreprocessors.py +++ /dev/null @@ -1,214 +0,0 @@ - -""" -PRE-PROCESSORS -============================================================================= - -Preprocessors work on source text before we start doing anything too -complicated. -""" - -import re -import markdown - -HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:" -HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX - -class Processor: - def __init__(self, markdown_instance=None): - if markdown_instance: - self.markdown = markdown_instance - -class Preprocessor (Processor): - """ - Preprocessors are run after the text is broken into lines. - - Each preprocessor implements a "run" method that takes a pointer to a - list of lines of the document, modifies it as necessary and returns - either the same pointer or a pointer to a new list. - - Preprocessors must extend markdown.Preprocessor. - - """ - def run(self, lines): - """ - Each subclass of Preprocessor should override the `run` method, which - takes the document as a list of strings split by newlines and returns - the (possibly modified) list of lines. - - """ - pass - -class HtmlStash: - """ - This class is used for stashing HTML objects that we extract - in the beginning and replace with place-holders. - """ - - def __init__ (self): - """ Create a HtmlStash. """ - self.html_counter = 0 # for counting inline html segments - self.rawHtmlBlocks=[] - - def store(self, html, safe=False): - """ - Saves an HTML segment for later reinsertion. Returns a - placeholder string that needs to be inserted into the - document. - - Keyword arguments: - - * html: an html segment - * safe: label an html segment as safe for safemode - - Returns : a placeholder string - - """ - self.rawHtmlBlocks.append((html, safe)) - placeholder = HTML_PLACEHOLDER % self.html_counter - self.html_counter += 1 - return placeholder - - def reset(self): - self.html_counter = 0 - self.rawHtmlBlocks = [] - - -class HtmlBlockPreprocessor(Preprocessor): - """Remove html blocks from the text and store them for later retrieval.""" - - right_tag_patterns = ["", "%s>"] - - def _get_left_tag(self, block): - return block[1:].replace(">", " ", 1).split()[0].lower() - - def _get_right_tag(self, left_tag, block): - for p in self.right_tag_patterns: - tag = p % left_tag - i = block.rfind(tag) - if i > 2: - return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) - return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) - - def _equal_tags(self, left_tag, right_tag): - if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. - return True - if ("/" + left_tag) == right_tag: - return True - if (right_tag == "--" and left_tag == "--"): - return True - elif left_tag == right_tag[1:] \ - and right_tag[0] != "<": - return True - else: - return False - - def _is_oneliner(self, tag): - return (tag in ['hr', 'hr/']) - - def run(self, lines): - text = "\n".join(lines) - new_blocks = [] - text = text.split("\n\n") - items = [] - left_tag = '' - right_tag = '' - in_tag = False # flag - - while text: - block = text[0] - if block.startswith("\n"): - block = block[1:] - text = text[1:] - - if block.startswith("\n"): - block = block[1:] - - if not in_tag: - if block.startswith("<"): - left_tag = self._get_left_tag(block) - right_tag, data_index = self._get_right_tag(left_tag, block) - - if data_index < len(block): - text.insert(0, block[data_index:]) - block = block[:data_index] - - if not (markdown.isBlockLevel(left_tag) \ - or block[1] in ["!", "?", "@", "%"]): - new_blocks.append(block) - continue - - if self._is_oneliner(left_tag): - new_blocks.append(block.strip()) - continue - - if block[1] == "!": - # is a comment block - left_tag = "--" - right_tag, data_index = self._get_right_tag(left_tag, block) - # keep checking conditions below and maybe just append - - if block.rstrip().endswith(">") \ - and self._equal_tags(left_tag, right_tag): - new_blocks.append( - self.markdown.htmlStash.store(block.strip())) - continue - else: #if not block[1] == "!": - # if is block level tag and is not complete - - if markdown.isBlockLevel(left_tag) or left_tag == "--" \ - and not block.rstrip().endswith(">"): - items.append(block.strip()) - in_tag = True - else: - new_blocks.append( - self.markdown.htmlStash.store(block.strip())) - - continue - - new_blocks.append(block) - - else: - items.append(block.strip()) - - right_tag, data_index = self._get_right_tag(left_tag, block) - - if self._equal_tags(left_tag, right_tag): - # if find closing tag - in_tag = False - new_blocks.append( - self.markdown.htmlStash.store('\n\n'.join(items))) - items = [] - - if items: - new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) - new_blocks.append('\n') - - new_text = "\n\n".join(new_blocks) - return new_text.split("\n") - - -class ReferencePreprocessor(Preprocessor): - """ Remove reference definitions from text and store for later use. """ - - RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL) - - def run (self, lines): - new_text = []; - for line in lines: - m = self.RE.match(line) - if m: - id = m.group(2).strip().lower() - t = m.group(4).strip() # potential title - if not t: - self.markdown.references[id] = (m.group(3), t) - elif (len(t) >= 2 - and (t[0] == t[-1] == "\"" - or t[0] == t[-1] == "\'" - or (t[0] == "(" and t[-1] == ")") ) ): - self.markdown.references[id] = (m.group(3), t[1:-1]) - else: - new_text.append(line) - else: - new_text.append(line) - - return new_text #+ "\n" diff --git a/markdown/postprocessors.py b/markdown/postprocessors.py index cd872cf..80227bb 100644 --- a/markdown/postprocessors.py +++ b/markdown/postprocessors.py @@ -1,3 +1,13 @@ +""" +POST-PROCESSORS +============================================================================= + +Markdown also allows post-processors, which are similar to preprocessors in +that they need to implement a "run" method. However, they are run after core +processing. + +""" + import markdown @@ -42,9 +52,11 @@ class RawHtmlPostprocessor(Postprocessor): else: html = markdown.HTML_REMOVED_TEXT if safe or not self.markdown.safeMode: - text = text.replace("

%s

" % (markdown.linepreprocessors.HTML_PLACEHOLDER % i), - html + "\n") - text = text.replace(markdown.linepreprocessors.HTML_PLACEHOLDER % i, html) + text = text.replace("

%s

" % + (markdown.preprocessors.HTML_PLACEHOLDER % i), + html + "\n") + text = text.replace(markdown.preprocessors.HTML_PLACEHOLDER % i, + html) return text def escape(self, html): diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py new file mode 100644 index 0000000..712a1e8 --- /dev/null +++ b/markdown/preprocessors.py @@ -0,0 +1,214 @@ + +""" +PRE-PROCESSORS +============================================================================= + +Preprocessors work on source text before we start doing anything too +complicated. +""" + +import re +import markdown + +HTML_PLACEHOLDER_PREFIX = markdown.STX+"wzxhzdk:" +HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + markdown.ETX + +class Processor: + def __init__(self, markdown_instance=None): + if markdown_instance: + self.markdown = markdown_instance + +class Preprocessor (Processor): + """ + Preprocessors are run after the text is broken into lines. + + Each preprocessor implements a "run" method that takes a pointer to a + list of lines of the document, modifies it as necessary and returns + either the same pointer or a pointer to a new list. + + Preprocessors must extend markdown.Preprocessor. + + """ + def run(self, lines): + """ + Each subclass of Preprocessor should override the `run` method, which + takes the document as a list of strings split by newlines and returns + the (possibly modified) list of lines. + + """ + pass + +class HtmlStash: + """ + This class is used for stashing HTML objects that we extract + in the beginning and replace with place-holders. + """ + + def __init__ (self): + """ Create a HtmlStash. """ + self.html_counter = 0 # for counting inline html segments + self.rawHtmlBlocks=[] + + def store(self, html, safe=False): + """ + Saves an HTML segment for later reinsertion. Returns a + placeholder string that needs to be inserted into the + document. + + Keyword arguments: + + * html: an html segment + * safe: label an html segment as safe for safemode + + Returns : a placeholder string + + """ + self.rawHtmlBlocks.append((html, safe)) + placeholder = HTML_PLACEHOLDER % self.html_counter + self.html_counter += 1 + return placeholder + + def reset(self): + self.html_counter = 0 + self.rawHtmlBlocks = [] + + +class HtmlBlockPreprocessor(Preprocessor): + """Remove html blocks from the text and store them for later retrieval.""" + + right_tag_patterns = ["", "%s>"] + + def _get_left_tag(self, block): + return block[1:].replace(">", " ", 1).split()[0].lower() + + def _get_right_tag(self, left_tag, block): + for p in self.right_tag_patterns: + tag = p % left_tag + i = block.rfind(tag) + if i > 2: + return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) + return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) + + def _equal_tags(self, left_tag, right_tag): + if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. + return True + if ("/" + left_tag) == right_tag: + return True + if (right_tag == "--" and left_tag == "--"): + return True + elif left_tag == right_tag[1:] \ + and right_tag[0] != "<": + return True + else: + return False + + def _is_oneliner(self, tag): + return (tag in ['hr', 'hr/']) + + def run(self, lines): + text = "\n".join(lines) + new_blocks = [] + text = text.split("\n\n") + items = [] + left_tag = '' + right_tag = '' + in_tag = False # flag + + while text: + block = text[0] + if block.startswith("\n"): + block = block[1:] + text = text[1:] + + if block.startswith("\n"): + block = block[1:] + + if not in_tag: + if block.startswith("<"): + left_tag = self._get_left_tag(block) + right_tag, data_index = self._get_right_tag(left_tag, block) + + if data_index < len(block): + text.insert(0, block[data_index:]) + block = block[:data_index] + + if not (markdown.isBlockLevel(left_tag) \ + or block[1] in ["!", "?", "@", "%"]): + new_blocks.append(block) + continue + + if self._is_oneliner(left_tag): + new_blocks.append(block.strip()) + continue + + if block[1] == "!": + # is a comment block + left_tag = "--" + right_tag, data_index = self._get_right_tag(left_tag, block) + # keep checking conditions below and maybe just append + + if block.rstrip().endswith(">") \ + and self._equal_tags(left_tag, right_tag): + new_blocks.append( + self.markdown.htmlStash.store(block.strip())) + continue + else: #if not block[1] == "!": + # if is block level tag and is not complete + + if markdown.isBlockLevel(left_tag) or left_tag == "--" \ + and not block.rstrip().endswith(">"): + items.append(block.strip()) + in_tag = True + else: + new_blocks.append( + self.markdown.htmlStash.store(block.strip())) + + continue + + new_blocks.append(block) + + else: + items.append(block.strip()) + + right_tag, data_index = self._get_right_tag(left_tag, block) + + if self._equal_tags(left_tag, right_tag): + # if find closing tag + in_tag = False + new_blocks.append( + self.markdown.htmlStash.store('\n\n'.join(items))) + items = [] + + if items: + new_blocks.append(self.markdown.htmlStash.store('\n\n'.join(items))) + new_blocks.append('\n') + + new_text = "\n\n".join(new_blocks) + return new_text.split("\n") + + +class ReferencePreprocessor(Preprocessor): + """ Remove reference definitions from text and store for later use. """ + + RE = re.compile(r'^(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)$', re.DOTALL) + + def run (self, lines): + new_text = []; + for line in lines: + m = self.RE.match(line) + if m: + id = m.group(2).strip().lower() + t = m.group(4).strip() # potential title + if not t: + self.markdown.references[id] = (m.group(3), t) + elif (len(t) >= 2 + and (t[0] == t[-1] == "\"" + or t[0] == t[-1] == "\'" + or (t[0] == "(" and t[-1] == ")") ) ): + self.markdown.references[id] = (m.group(3), t[1:-1]) + else: + new_text.append(line) + else: + new_text.append(line) + + return new_text #+ "\n" diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py index e8d7cd0..0ea0de2 100644 --- a/markdown/treeprocessors.py +++ b/markdown/treeprocessors.py @@ -277,26 +277,31 @@ class InlineProcessor(Treeprocessor): for element, lst in insertQueue: if element.text: - element.text = markdown.inlinepatterns.handleAttributes(element.text, element) + element.text = \ + markdown.inlinepatterns.handleAttributes(element.text, + element) i = 0 for newChild in lst: # Processing attributes if newChild.tail: - newChild.tail = markdown.inlinepatterns.handleAttributes(newChild.tail, - element) + newChild.tail = \ + markdown.inlinepatterns.handleAttributes(newChild.tail, + element) if newChild.text: - newChild.text = markdown.inlinepatterns.handleAttributes(newChild.text, - newChild) + newChild.text = \ + markdown.inlinepatterns.handleAttributes(newChild.text, + newChild) element.insert(i, newChild) i += 1 - return tree class PrettifyTreeprocessor(Treeprocessor): - """Add linebreaks to the html document.""" + """ Add linebreaks to the html document. """ + def _prettifyETree(self, elem): - """Recursively add linebreaks to ElementTree children.""" + """ Recursively add linebreaks to ElementTree children. """ + i = "\n" if markdown.isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']: if (not elem.text or not elem.text.strip()) \ @@ -311,7 +316,8 @@ class PrettifyTreeprocessor(Treeprocessor): elem.tail = i def run(self, root): - """.Add linebreaks to ElementTree root object.""" + """ Add linebreaks to ElementTree root object. """ + self._prettifyETree(root) # Do
's seperately as they are often in the middle of # inline content and missed by _prettifyETree. -- cgit v1.2.3