From ba147ca9b2eae544e802c8216936065d2d86a8d8 Mon Sep 17 00:00:00 2001 From: Waylan Limberg Date: Tue, 11 Nov 2008 19:18:41 -0500 Subject: Replaced old core parser with new BlockParser and copied old core into extension. --- markdown_extensions/legacy.py | 469 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 469 insertions(+) create mode 100644 markdown_extensions/legacy.py (limited to 'markdown_extensions') diff --git a/markdown_extensions/legacy.py b/markdown_extensions/legacy.py new file mode 100644 index 0000000..1320734 --- /dev/null +++ b/markdown_extensions/legacy.py @@ -0,0 +1,469 @@ +""" +Legacy Extension for Python-Markdown +==================================== + +Replaces the core parser with the old one. + +""" + +import markdown + +"""Basic and reusable regular expressions.""" + +def wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL) +CORE_RE = { + 'header': wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title + 'reference-def': wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'), + # [Google]: http://www.google.com/ + 'containsline': wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc. + 'ol': wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text + 'ul': wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text" + 'isline1': wrapRe(r'(\**)'), # *** + 'isline2': wrapRe(r'(\-*)'), # --- + 'isline3': wrapRe(r'(\_*)'), # ___ + 'tabbed': wrapRe(r'((\t)|( ))(.*)'), # an indented line + 'quoted': wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...") + 'containsline': re.compile(r'^([-]*)$|^([=]*)$', re.M), + 'attr': re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123} +} + +class MarkdownParser: + """Parser Markdown into a ElementTree.""" + + def __init__(self): + pass + + def parseDocument(self, lines): + """Parse a markdown string into an ElementTree.""" + # Create a ElementTree from the lines + root = etree.Element("div") + buffer = [] + for line in lines: + if line.startswith("#"): + self.parseChunk(root, buffer) + buffer = [line] + else: + buffer.append(line) + + self.parseChunk(root, buffer) + + return etree.ElementTree(root) + + def parseChunk(self, parent_elem, lines, inList=0, looseList=0): + """Process a chunk of markdown-formatted text and attach the parse to + an ElementTree node. + + Process a section of a source document, looking for high + level structural elements like lists, block quotes, code + segments, html blocks, etc. Some those then get stripped + of their high level markup (e.g. get unindented) and the + lower-level markup is processed recursively. + + Keyword arguments: + + * parent_elem: The ElementTree element to which the content will be + added. + * lines: a list of lines + * inList: a level + + Returns: None + + """ + # Loop through lines until none left. + while lines: + # Skipping empty line + if not lines[0]: + lines = lines[1:] + continue + + # Check if this section starts with a list, a blockquote or + # a code block. If so, process them. + processFn = { 'ul': self.__processUList, + 'ol': self.__processOList, + 'quoted': self.__processQuote, + 'tabbed': self.__processCodeBlock} + for regexp in ['ul', 'ol', 'quoted', 'tabbed']: + m = CORE_RE[regexp].match(lines[0]) + if m: + processFn[regexp](parent_elem, lines, inList) + return + + # We are NOT looking at one of the high-level structures like + # lists or blockquotes. So, it's just a regular paragraph + # (though perhaps nested inside a list or something else). If + # we are NOT inside a list, we just need to look for a blank + # line to find the end of the block. If we ARE inside a + # list, however, we need to consider that a sublist does not + # need to be separated by a blank line. Rather, the following + # markup is legal: + # + # * The top level list item + # + # Another paragraph of the list. This is where we are now. + # * Underneath we might have a sublist. + # + + if inList: + start, lines = self.__linesUntil(lines, (lambda line: + CORE_RE['ul'].match(line) + or CORE_RE['ol'].match(line) + or not line.strip())) + self.parseChunk(parent_elem, start, inList-1, + looseList=looseList) + inList = inList-1 + + else: # Ok, so it's just a simple block + test = lambda line: not line.strip() or line[0] == '>' + paragraph, lines = self.__linesUntil(lines, test) + if len(paragraph) and paragraph[0].startswith('#'): + self.__processHeader(parent_elem, paragraph) + elif len(paragraph) and CORE_RE["isline3"].match(paragraph[0]): + self.__processHR(parent_elem) + lines = paragraph[1:] + lines + elif paragraph: + self.__processParagraph(parent_elem, paragraph, + inList, looseList) + + if lines and not lines[0].strip(): + lines = lines[1:] # skip the first (blank) line + + def __processHR(self, parentElem): + hr = etree.SubElement(parentElem, "hr") + + def __processHeader(self, parentElem, paragraph): + m = CORE_RE['header'].match(paragraph[0]) + if m: + level = len(m.group(1)) + h = etree.SubElement(parentElem, "h%d" % level) + h.text = m.group(2).strip() + else: + message(CRITICAL, "We've got a problem header!") + + def __processParagraph(self, parentElem, paragraph, inList, looseList): + + if ( parentElem.tag == 'li' + and not (looseList or parentElem.getchildren())): + + # If this is the first paragraph inside "li", don't + # put

around it - append the paragraph bits directly + # onto parentElem + el = parentElem + else: + # Otherwise make a "p" element + el = etree.SubElement(parentElem, "p") + + dump = [] + + # Searching for hr or header + for line in paragraph: + # it's hr + if CORE_RE["isline3"].match(line): + el.text = "\n".join(dump) + self.__processHR(el) + dump = [] + # it's header + elif line.startswith("#"): + el.text = "\n".join(dump) + self.__processHeader(parentElem, [line]) + dump = [] + else: + dump.append(line) + if dump: + text = "\n".join(dump) + el.text = text + + def __processUList(self, parentElem, lines, inList): + self.__processList(parentElem, lines, inList, listexpr='ul', tag='ul') + + def __processOList(self, parentElem, lines, inList): + self.__processList(parentElem, lines, inList, listexpr='ol', tag='ol') + + def __processList(self, parentElem, lines, inList, listexpr, tag): + """ + Given a list of document lines starting with a list item, + finds the end of the list, breaks it up, and recursively + processes each list item and the remainder of the text file. + + Keyword arguments: + + * parentElem: A ElementTree element to which the content will be added + * lines: a list of lines + * inList: a level + + Returns: None + + """ + ul = etree.SubElement(parentElem, tag) # ul might actually be '

    ' + + looseList = 0 + + # Make a list of list items + items = [] + item = -1 + + i = 0 # a counter to keep track of where we are + for line in lines: + loose = 0 + if not line.strip(): + # If we see a blank line, this _might_ be the end of the list + i += 1 + loose = 1 + + # Find the next non-blank line + for j in range(i, len(lines)): + if lines[j].strip(): + next = lines[j] + break + else: + # There is no more text => end of the list + break + + # Check if the next non-blank line is still a part of the list + + if ( CORE_RE[listexpr].match(next) or + CORE_RE['tabbed'].match(next) ): + # get rid of any white space in the line + items[item].append(line.strip()) + looseList = loose or looseList + continue + else: + break # found end of the list + + # Now we need to detect list items (at the current level) + # while also detabing child elements if necessary + + for expr in ['ul', 'ol', 'tabbed']: + m = CORE_RE[expr].match(line) + if m: + if expr in ['ul', 'ol']: # We are looking at a new item + #if m.group(1) : + # Removed the check to allow for a blank line + # at the beginning of the list item + items.append([m.group(1)]) + item += 1 + elif expr == 'tabbed': # This line needs to be detabbed + items[item].append(m.group(4)) #after the 'tab' + i += 1 + break + else: + items[item].append(line) # Just regular continuation + i += 1 # added on 2006.02.25 + else: + i += 1 + + # Add the ElementTree elements + for item in items: + li = etree.SubElement(ul, "li") + self.parseChunk(li, item, inList + 1, looseList = looseList) + + # Process the remaining part of the section + self.parseChunk(parentElem, lines[i:], inList) + + def __linesUntil(self, lines, condition): + """ + A utility function to break a list of lines upon the + first line that satisfied a condition. The condition + argument should be a predicate function. + + """ + i = -1 + for line in lines: + i += 1 + if condition(line): + break + else: + i += 1 + return lines[:i], lines[i:] + + def __processQuote(self, parentElem, lines, inList): + """ + Given a list of document lines starting with a quote finds + the end of the quote, unindents it and recursively + processes the body of the quote and the remainder of the + text file. + + Keyword arguments: + + * parentElem: ElementTree element to which the content will be added + * lines: a list of lines + * inList: a level + + Returns: None + + """ + dequoted = [] + i = 0 + blank_line = False # allow one blank line between paragraphs + for line in lines: + m = CORE_RE['quoted'].match(line) + if m: + dequoted.append(m.group(1)) + i += 1 + blank_line = False + elif not blank_line and line.strip() != '': + dequoted.append(line) + i += 1 + elif not blank_line and line.strip() == '': + dequoted.append(line) + i += 1 + blank_line = True + else: + break + + blockquote = etree.SubElement(parentElem, "blockquote") + + self.parseChunk(blockquote, dequoted, inList) + self.parseChunk(parentElem, lines[i:], inList) + + def __processCodeBlock(self, parentElem, lines, inList): + """ + Given a list of document lines starting with a code block + finds the end of the block, puts it into the ElementTree verbatim + wrapped in ("
    ") and recursively processes the
    +        the remainder of the text file.
    +
    +        Keyword arguments:
    +
    +        * parentElem: ElementTree element to which the content will be added
    +        * lines: a list of lines
    +        * inList: a level
    +
    +        Returns: None
    +
    +        """
    +        detabbed, theRest = self.detectTabbed(lines)
    +        pre = etree.SubElement(parentElem, "pre")
    +        code = etree.SubElement(pre, "code")
    +        text = "\n".join(detabbed).rstrip()+"\n"
    +        code.text = AtomicString(text)
    +        self.parseChunk(parentElem, theRest, inList)
    +
    +    def detectTabbed(self, lines):
    +        """ Find indented text and remove indent before further proccesing.
    +
    +        Keyword arguments:
    +
    +        * lines: an array of strings
    +        * fn: a function that returns a substring of a string
    +           if the string matches the necessary criteria
    +
    +        Returns: a list of post processes items and the unused
    +        remainder of the original list
    +
    +        """
    +        items = []
    +        item = -1
    +        i = 0 # to keep track of where we are
    +
    +        def detab(line):
    +            match = CORE_RE['tabbed'].match(line)
    +            if match:
    +               return match.group(4)
    +
    +        for line in lines:
    +            if line.strip(): # Non-blank line
    +                line = detab(line)
    +                if line:
    +                    items.append(line)
    +                    i += 1
    +                    continue
    +                else:
    +                    return items, lines[i:]
    +
    +            else: # Blank line: _maybe_ we are done.
    +                i += 1 # advance
    +
    +                # Find the next non-blank line
    +                for j in range(i, len(lines)):
    +                    if lines[j].strip():
    +                        next_line = lines[j]; break
    +                else:
    +                    break # There is no more text; we are done.
    +
    +                # Check if the next non-blank line is tabbed
    +                if detab(next_line): # Yes, more work to do.
    +                    items.append("")
    +                    continue
    +                else:
    +                    break # No, we are done.
    +        else:
    +            i += 1
    +
    +        return items, lines[i:]
    +
    +class HeaderPreprocessor(Preprocessor):
    +
    +    """Replace underlined headers with hashed headers.
    +
    +    (To avoid the need for lookahead later.)
    +
    +    """
    +
    +    def run (self, lines):
    +        i = -1
    +        while i+1 < len(lines):
    +            i = i+1
    +            if not lines[i].strip():
    +                continue
    +
    +            if lines[i].startswith("#"):
    +                lines.insert(i+1, "\n")
    +
    +            if (i+1 <= len(lines)
    +                  and lines[i+1]
    +                  and lines[i+1][0] in ['-', '=']):
    +
    +                underline = lines[i+1].strip()
    +
    +                if underline == "="*len(underline):
    +                    lines[i] = "# " + lines[i].strip()
    +                    lines[i+1] = ""
    +                elif underline == "-"*len(underline):
    +                    lines[i] = "## " + lines[i].strip()
    +                    lines[i+1] = ""
    +
    +        return lines
    +
    +
    +class LinePreprocessor(Preprocessor):
    +    """Convert HR lines to "___" format."""
    +    blockquote_re = re.compile(r'^(> )+')
    +
    +    def run (self, lines):
    +        for i in range(len(lines)):
    +            prefix = ''
    +            m = self.blockquote_re.search(lines[i])
    +            if m:
    +                prefix = m.group(0)
    +            if self._isLine(lines[i][len(prefix):]):
    +                lines[i] = prefix + "___"
    +        return lines
    +
    +    def _isLine(self, block):
    +        """Determine if a block should be replaced with an 
    """ + if block.startswith(" "): + return False # a code block + text = "".join([x for x in block if not x.isspace()]) + if len(text) <= 2: + return False + for pattern in ['isline1', 'isline2', 'isline3']: + m = CORE_RE[pattern].match(text) + if (m and m.group(1)): + return True + else: + return False + + +class LegacyExtension(markdown.Extension): + """ Replace Markdown's core parser. """ + + def extendMarkdown(self, md, md_globals): + """ Set the core parser to an instance of MarkdownParser. """ + md.parser = MarkdownParser() + md.preprocessors.add ("header", HeaderPreprocessor(self), "