From 75c56efdc078fc3586a40a3dce7239819ded39a3 Mon Sep 17 00:00:00 2001 From: Artem Yunusov Date: Wed, 2 Jul 2008 00:40:00 +0500 Subject: New mechanism for processing Inline Patterns. --- markdown.py | 281 +++++++++++++++++++++++--------------- tests/markdown-test/benchmark.dat | 36 ++--- 2 files changed, 187 insertions(+), 130 deletions(-) diff --git a/markdown.py b/markdown.py index 92e180e..5e5586f 100644 --- a/markdown.py +++ b/markdown.py @@ -33,7 +33,7 @@ __revision__ = "$Rev$" -import re, sys, codecs +import re, sys, codecs, copy from urlparse import urlparse, urlunparse from logging import getLogger, StreamHandler, Formatter, \ @@ -124,6 +124,7 @@ def isBlockLevel (tag): return ( (tag in BLOCK_LEVEL_ELEMENTS) or (tag[0] == 'h' and tag[1] in "0123456789") ) + """ ====================================================================== ========================== NANODOM =================================== @@ -783,16 +784,18 @@ BACKTICK_RE = r'\`([^\`]*)\`' # `e= m*c^2` DOUBLE_BACKTICK_RE = r'\`\`(.*?)\`\`' # ``e=f("`")`` ESCAPE_RE = r'\\(.)' # \< EMPHASIS_RE = r'\*([^\*]*)\*' # *emphasis* -STRONG_RE = r'\*\*(.*?)\*\*' # **strong** -STRONG_EM_RE = r'\*\*\*(.*?)\*\*\*' # ***strong*** +STRONG_RE = r'\*\*(.*?|[^**]+?)\*\*' # **strong** +STRONG_EM_RE = r'\*\*\*(.*?|[^***]+?)\*\*\*' # ***strong*** + + if SMART_EMPHASIS: EMPHASIS_2_RE = r'(?\)' # [text]() @@ -1294,7 +1297,31 @@ def dequote(string): return string[1:-1] else: return string - + + +class InlineStash: + + def __init__(self): + self.prefix = "k@J!}" + self.suffix = "lL5Qt" + self._nodes = {} + self.phLength = 4 + len(self.prefix) + len(self.suffix) + + def _genPlaceholder(self): + hash = "%s%04d%s" % (self.prefix, len(self._nodes), self.suffix) + return hash + + def isin(self, placeholder): + return self._nodes.has_key(placeholder) + + def get(self, placeholder): + return self._nodes.get(placeholder) + + def add(self, node): + pholder = self._genPlaceholder() + self._nodes[pholder] = node + return pholder + """ ====================================================================== ========================== CORE MARKDOWN ============================= @@ -1387,7 +1414,33 @@ class Markdown: self.prePatterns = [] - + # temporarily disabled patterns + '''DOUBLE_BACKTICK_PATTERN, + BACKTICK_PATTERN, + ESCAPE_PATTERN, + REFERENCE_PATTERN, + LINK_ANGLED_PATTERN, + LINK_PATTERN, + IMAGE_LINK_PATTERN, + IMAGE_REFERENCE_PATTERN, + AUTOLINK_PATTERN, + AUTOMAIL_PATTERN, + LINE_BREAK_PATTERN_2, + LINE_BREAK_PATTERN, + HTML_PATTERN, + ENTITY_PATTERN, + NOT_STRONG_PATTERN, + STRONG_EM_PATTERN, + STRONG_EM_PATTERN_2,''' + + '''self.inlinePatterns = [LINK_PATTERN, + STRONG_PATTERN, + STRONG_PATTERN_2, + EMPHASIS_PATTERN, + EMPHASIS_PATTERN_2 + # The order of the handlers matters!!! + ]''' + self.inlinePatterns = [DOUBLE_BACKTICK_PATTERN, BACKTICK_PATTERN, ESCAPE_PATTERN, @@ -1395,8 +1448,8 @@ class Markdown: LINK_ANGLED_PATTERN, LINK_PATTERN, IMAGE_LINK_PATTERN, - IMAGE_REFERENCE_PATTERN, - AUTOLINK_PATTERN, + IMAGE_REFERENCE_PATTERN, + AUTOLINK_PATTERN, AUTOMAIL_PATTERN, LINE_BREAK_PATTERN_2, LINE_BREAK_PATTERN, @@ -1411,6 +1464,8 @@ class Markdown: EMPHASIS_PATTERN_2 # The order of the handlers matters!!! ] + + self.inlineStash = InlineStash() self.registerExtensions(extensions = extensions, configs = extension_configs) @@ -1453,6 +1508,7 @@ class Markdown: """ self.references={} self.htmlStash = HtmlStash() + self.inlineStash = InlineStash() HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash LINE_PREPROCESSOR.stash = self.htmlStash @@ -1486,6 +1542,7 @@ class Markdown: self.top_element.setAttribute('class', 'markdown') self.doc.appendChild(self.top_element) + # Split into lines and run the preprocessors that will work with # self.lines @@ -1821,134 +1878,133 @@ class Markdown: text = "\n".join(detabbed).rstrip()+"\n" #text = text.replace("&", "&") code.appendChild(self.doc.createTextNode(text)) - self._processSection(parent_elem, theRest, inList) - - - - def _handleInline (self, line, patternIndex=0): + self._processSection(parent_elem, theRest, inList) + + def _handleInline(self, data, patternIndex=0): """ - Transform a Markdown line with inline elements to an XHTML - fragment. - - This function uses auxiliary objects called inline patterns. - See notes on inline patterns above. + Processinf string with inline patterns and replasing it + with placeholders Keyword arguments: - * line: A line of Markdown text + * data: A line of Markdown text * patternIndex: The index of the inlinePattern to start with - Return: A list of NanoDom nodes + Return: String with placeholders. """ - - - parts = [line] - + while patternIndex < len(self.inlinePatterns): + + data, matched = self._applyInline(self.inlinePatterns[patternIndex], + data, patternIndex) + if not matched: + patternIndex += 1 + + return data + #return self._processPlaceholders(data) + + + def _processPlaceholders(self, data): + """ + Processes string with placeholders and generates DOM tree. + + * data: string with placeholders instead of DOM elements. - i = 0 - - while i < len(parts): + Returns: NanoDOM Document object with applied inline patterns. + """ + + result = [] + prefix = self.inlineStash.prefix + strartIndex = 0 + while data: + + index = data.find(prefix, strartIndex) + if index != -1: - x = parts[i] - - if isinstance(x, (str, unicode)): - result = self._applyPattern(x, \ - self.inlinePatterns[patternIndex], \ - patternIndex) - - if result: - i -= 1 - parts.remove(x) - for y in result: - parts.insert(i+1,y) - - i += 1 - patternIndex += 1 - - for i in range(len(parts)): - x = parts[i] - if isinstance(x, (str, unicode)): - parts[i] = self.doc.createTextNode(x) + phEndIndex = index + self.inlineStash.phLength + placeHolder = data[index: phEndIndex] + + if self.inlineStash.isin(placeHolder): + + if index > 0: + + textNode = self.doc.createTextNode(data[strartIndex:index]) + + result.append(textNode) + + node = self.inlineStash.get(placeHolder) - return parts + if isinstance(node, Element): + for child in node.childNodes: + if isinstance(child, TextNode): + childResult = self._processPlaceholders(child.value) + pos = node.childNodes.index(child) + node.removeChild(child) + for newChild in childResult: + node.insertChild(pos, newChild) + + result.append(node) + + strartIndex = phEndIndex + + else: + + strartIndex = index + len(prefix) + else: + text = self.doc.createTextNode(data[strartIndex:]) + result.append(text) + data = "" + + return result + + - def _applyPattern(self, line, pattern, patternIndex): - + def _applyInline(self, pattern, data, patternIndex): """ Given a pattern name, this function checks if the line - fits the pattern, creates the necessary elements, and returns - back a list consisting of NanoDom elements and/or strings. + fits the pattern, creates the necessary elements, adds it + to InlineStash, and returns string with placeholders, + instead of DOM elements. Keyword arguments: - * line: the text to be processed + * data: the text to be processed * pattern: the pattern to be checked + * patternIndex: index of current pattern - Returns: The appropriate newly created NanoDom element if the - pattern matches, None otherwise. + Returns: String with placeholders. """ + + match = pattern.getCompiledRegExp().match(data) + + if not match: + return data, False - # match the line to pattern's pre-compiled reg exp. - # if no match, move on. - - - - m = pattern.getCompiledRegExp().match(line) - if not m: - return None - - # if we got a match let the pattern make us a NanoDom node - # if it doesn't, move on - node = pattern.handleMatch(m, self.doc) - - # check if any of this nodes have children that need processing - + node = pattern.handleMatch(match, self.doc) + if isinstance(node, Element): + for child in node.childNodes: + if isinstance(child, TextNode): + child.value = self._handleInline(child.value, patternIndex) + + + pholder = self.inlineStash.add(node) - if not node.nodeName in ["code", "pre"]: - for child in node.childNodes: - if isinstance(child, TextNode): - - result = self._handleInline(child.value, patternIndex+1) - - if result: - - if result == [child]: - continue - - result.reverse() - #to make insertion easier - - position = node.childNodes.index(child) - - node.removeChild(child) - - for item in result: - - if isinstance(item, (str, unicode)): - if len(item) > 0: - node.insertChild(position, - self.doc.createTextNode(item)) - else: - node.insertChild(position, item) - - - - - if node: - # Those are in the reverse order! - return ( m.groups()[-1], # the string to the left - node, # the new node - m.group(1)) # the string to the right of the match - - else: - return None + return "%s%s%s" % (match.group(1), pholder, match.groups()[-1]), True def _processTree(self, el): + """ + Processing NanoDOM markdown tree, and applying inline patterns + + Keyword arguments: + + * el - parent element of Document. + + Returns: NanoDOM Document object with applied inline patterns. + """ stack = [el] while stack: @@ -1958,7 +2014,8 @@ class Markdown: if child.type == "inline": - lst = self._handleInline(child.value) + lst = self._processPlaceholders(self._handleInline( + child.value)) pos = currElement.childNodes.index(child) @@ -1971,7 +2028,7 @@ class Markdown: del currElement.childNodes[pos] for newChild in lst: currElement.insertChild(pos, newChild) - pos += 1 + pos += 1 def applyInlinePatterns(self, markdownTree): """ @@ -2003,7 +2060,7 @@ class Markdown: * source: An ascii or unicode string of Markdown formated text. - Returns: NanoDOM document. + Returns: NanoDOM Document object. """ if source is not None: #Allow blank string self.source = source diff --git a/tests/markdown-test/benchmark.dat b/tests/markdown-test/benchmark.dat index 5b645ed..3d549dd 100644 --- a/tests/markdown-test/benchmark.dat +++ b/tests/markdown-test/benchmark.dat @@ -1,20 +1,20 @@ construction:0.000000:0.000000 -amps-and-angle-encoding:0.060000:0.000000 -auto-links:0.070000:135168.000000 -backlash-escapes:0.220000:360448.000000 +amps-and-angle-encoding:0.070000:131072.000000 +auto-links:0.080000:397312.000000 +backlash-escapes:0.270000:884736.000000 blockquotes-with-dode-blocks:0.020000:0.000000 -hard-wrapped:0.010000:0.000000 -horizontal-rules:0.140000:0.000000 -inline-html-advanced:0.060000:0.000000 -inline-html-comments:0.070000:0.000000 -inline-html-simple:0.170000:0.000000 -links-inline:0.100000:0.000000 -links-reference:0.120000:0.000000 -literal-quotes:0.070000:0.000000 -markdown-documentation-basics:0.740000:1175552.000000 -markdown-syntax:3.030000:2596864.000000 -nested-blockquotes:0.100000:0.000000 -ordered-and-unordered-list:0.360000:0.000000 -strong-and-em-together:0.110000:0.000000 -tabs:0.120000:0.000000 -tidyness:0.120000:0.000000 +hard-wrapped:0.020000:0.000000 +horizontal-rules:0.180000:135168.000000 +inline-html-advanced:0.070000:0.000000 +inline-html-comments:0.080000:0.000000 +inline-html-simple:0.210000:0.000000 +links-inline:0.140000:0.000000 +links-reference:0.170000:0.000000 +literal-quotes:0.090000:0.000000 +markdown-documentation-basics:0.690000:1806336.000000 +markdown-syntax:3.310000:6696960.000000 +nested-blockquotes:0.200000:0.000000 +ordered-and-unordered-list:0.530000:0.000000 +strong-and-em-together:0.200000:0.000000 +tabs:0.200000:0.000000 +tidyness:0.200000:0.000000 -- cgit v1.2.3