aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorArtem Yunusov <nedrlab@gmail.com>2008-07-02 00:40:00 +0500
committerArtem Yunusov <nedrlab@gmail.com>2008-07-02 00:40:00 +0500
commit75c56efdc078fc3586a40a3dce7239819ded39a3 (patch)
treead15cba26a1b2ae214550fc4c1852a14b46f05a4
parentc6cd71006fb9544b0840547570e30348a55ab851 (diff)
downloadmarkdown-75c56efdc078fc3586a40a3dce7239819ded39a3.tar.gz
markdown-75c56efdc078fc3586a40a3dce7239819ded39a3.tar.bz2
markdown-75c56efdc078fc3586a40a3dce7239819ded39a3.zip
New mechanism for processing Inline Patterns.
-rw-r--r--markdown.py281
-rw-r--r--tests/markdown-test/benchmark.dat36
2 files changed, 187 insertions, 130 deletions
diff --git a/markdown.py b/markdown.py
index 92e180e..5e5586f 100644
--- a/markdown.py
+++ b/markdown.py
@@ -33,7 +33,7 @@ __revision__ = "$Rev$"
-import re, sys, codecs
+import re, sys, codecs, copy
from urlparse import urlparse, urlunparse
from logging import getLogger, StreamHandler, Formatter, \
@@ -124,6 +124,7 @@ def isBlockLevel (tag):
return ( (tag in BLOCK_LEVEL_ELEMENTS) or
(tag[0] == 'h' and tag[1] in "0123456789") )
+
"""
======================================================================
========================== NANODOM ===================================
@@ -783,16 +784,18 @@ BACKTICK_RE = r'\`([^\`]*)\`' # `e= m*c^2`
DOUBLE_BACKTICK_RE = r'\`\`(.*?)\`\`' # ``e=f("`")``
ESCAPE_RE = r'\\(.)' # \<
EMPHASIS_RE = r'\*([^\*]*)\*' # *emphasis*
-STRONG_RE = r'\*\*(.*?)\*\*' # **strong**
-STRONG_EM_RE = r'\*\*\*(.*?)\*\*\*' # ***strong***
+STRONG_RE = r'\*\*(.*?|[^**]+?)\*\*' # **strong**
+STRONG_EM_RE = r'\*\*\*(.*?|[^***]+?)\*\*\*' # ***strong***
+
+
if SMART_EMPHASIS:
EMPHASIS_2_RE = r'(?<!\S)_(\S[^_]*)_' # _emphasis_
else:
EMPHASIS_2_RE = r'_([^_]*)_' # _emphasis_
-STRONG_2_RE = r'__(.*?)__' # __strong__
-STRONG_EM_2_RE = r'___(.*?)___' # ___strong___
+STRONG_2_RE = r'__(.*?|[^__]+?)__' # __strong__
+STRONG_EM_2_RE = r'___(.*?|[^___]+?)___' # ___strong___
LINK_RE = NOIMG + BRK + r'\s*\(([^\)]*)\)' # [text](url)
LINK_ANGLED_RE = NOIMG + BRK + r'\s*\(<([^\)]*)>\)' # [text](<url>)
@@ -1294,7 +1297,31 @@ def dequote(string):
return string[1:-1]
else:
return string
-
+
+
+class InlineStash:
+
+ def __init__(self):
+ self.prefix = "k@J!}"
+ self.suffix = "lL5Qt"
+ self._nodes = {}
+ self.phLength = 4 + len(self.prefix) + len(self.suffix)
+
+ def _genPlaceholder(self):
+ hash = "%s%04d%s" % (self.prefix, len(self._nodes), self.suffix)
+ return hash
+
+ def isin(self, placeholder):
+ return self._nodes.has_key(placeholder)
+
+ def get(self, placeholder):
+ return self._nodes.get(placeholder)
+
+ def add(self, node):
+ pholder = self._genPlaceholder()
+ self._nodes[pholder] = node
+ return pholder
+
"""
======================================================================
========================== CORE MARKDOWN =============================
@@ -1387,7 +1414,33 @@ class Markdown:
self.prePatterns = []
-
+ # temporarily disabled patterns
+ '''DOUBLE_BACKTICK_PATTERN,
+ BACKTICK_PATTERN,
+ ESCAPE_PATTERN,
+ REFERENCE_PATTERN,
+ LINK_ANGLED_PATTERN,
+ LINK_PATTERN,
+ IMAGE_LINK_PATTERN,
+ IMAGE_REFERENCE_PATTERN,
+ AUTOLINK_PATTERN,
+ AUTOMAIL_PATTERN,
+ LINE_BREAK_PATTERN_2,
+ LINE_BREAK_PATTERN,
+ HTML_PATTERN,
+ ENTITY_PATTERN,
+ NOT_STRONG_PATTERN,
+ STRONG_EM_PATTERN,
+ STRONG_EM_PATTERN_2,'''
+
+ '''self.inlinePatterns = [LINK_PATTERN,
+ STRONG_PATTERN,
+ STRONG_PATTERN_2,
+ EMPHASIS_PATTERN,
+ EMPHASIS_PATTERN_2
+ # The order of the handlers matters!!!
+ ]'''
+
self.inlinePatterns = [DOUBLE_BACKTICK_PATTERN,
BACKTICK_PATTERN,
ESCAPE_PATTERN,
@@ -1395,8 +1448,8 @@ class Markdown:
LINK_ANGLED_PATTERN,
LINK_PATTERN,
IMAGE_LINK_PATTERN,
- IMAGE_REFERENCE_PATTERN,
- AUTOLINK_PATTERN,
+ IMAGE_REFERENCE_PATTERN,
+ AUTOLINK_PATTERN,
AUTOMAIL_PATTERN,
LINE_BREAK_PATTERN_2,
LINE_BREAK_PATTERN,
@@ -1411,6 +1464,8 @@ class Markdown:
EMPHASIS_PATTERN_2
# The order of the handlers matters!!!
]
+
+ self.inlineStash = InlineStash()
self.registerExtensions(extensions = extensions,
configs = extension_configs)
@@ -1453,6 +1508,7 @@ class Markdown:
"""
self.references={}
self.htmlStash = HtmlStash()
+ self.inlineStash = InlineStash()
HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash
LINE_PREPROCESSOR.stash = self.htmlStash
@@ -1486,6 +1542,7 @@ class Markdown:
self.top_element.setAttribute('class', 'markdown')
self.doc.appendChild(self.top_element)
+
# Split into lines and run the preprocessors that will work with
# self.lines
@@ -1821,134 +1878,133 @@ class Markdown:
text = "\n".join(detabbed).rstrip()+"\n"
#text = text.replace("&", "&amp;")
code.appendChild(self.doc.createTextNode(text))
- self._processSection(parent_elem, theRest, inList)
-
-
-
- def _handleInline (self, line, patternIndex=0):
+ self._processSection(parent_elem, theRest, inList)
+
+ def _handleInline(self, data, patternIndex=0):
"""
- Transform a Markdown line with inline elements to an XHTML
- fragment.
-
- This function uses auxiliary objects called inline patterns.
- See notes on inline patterns above.
+ Process a string with inline patterns, replacing each match
+ with a placeholder.
Keyword arguments:
- * line: A line of Markdown text
+ * data: A line of Markdown text
* patternIndex: The index of the inlinePattern to start with
- Return: A list of NanoDom nodes
+ Return: String with placeholders.
"""
-
-
- parts = [line]
-
+
while patternIndex < len(self.inlinePatterns):
+
+ data, matched = self._applyInline(self.inlinePatterns[patternIndex],
+ data, patternIndex)
+ if not matched:
+ patternIndex += 1
+
+ return data
+ #return self._processPlaceholders(data)
+
+
+ def _processPlaceholders(self, data):
+ """
+ Processes string with placeholders and generates DOM tree.
+
+ * data: string with placeholders instead of DOM elements.
- i = 0
-
- while i < len(parts):
+ Returns: List of NanoDOM nodes (text nodes and nodes retrieved from the inline stash).
+ """
+
+ result = []
+ prefix = self.inlineStash.prefix
+ strartIndex = 0
+ while data:
+
+ index = data.find(prefix, strartIndex)
+ if index != -1:
- x = parts[i]
-
- if isinstance(x, (str, unicode)):
- result = self._applyPattern(x, \
- self.inlinePatterns[patternIndex], \
- patternIndex)
-
- if result:
- i -= 1
- parts.remove(x)
- for y in result:
- parts.insert(i+1,y)
-
- i += 1
- patternIndex += 1
-
- for i in range(len(parts)):
- x = parts[i]
- if isinstance(x, (str, unicode)):
- parts[i] = self.doc.createTextNode(x)
+ phEndIndex = index + self.inlineStash.phLength
+ placeHolder = data[index: phEndIndex]
+
+ if self.inlineStash.isin(placeHolder):
+
+ if index > 0:
+
+ textNode = self.doc.createTextNode(data[strartIndex:index])
+
+ result.append(textNode)
+
+ node = self.inlineStash.get(placeHolder)
- return parts
+ if isinstance(node, Element):
+ for child in node.childNodes:
+ if isinstance(child, TextNode):
+ childResult = self._processPlaceholders(child.value)
+ pos = node.childNodes.index(child)
+ node.removeChild(child)
+ for newChild in childResult:
+ node.insertChild(pos, newChild)
+
+ result.append(node)
+
+ strartIndex = phEndIndex
+
+ else:
+
+ strartIndex = index + len(prefix)
+ else:
+ text = self.doc.createTextNode(data[strartIndex:])
+ result.append(text)
+ data = ""
+
+ return result
+
+
- def _applyPattern(self, line, pattern, patternIndex):
-
+ def _applyInline(self, pattern, data, patternIndex):
"""
Given a pattern name, this function checks if the line
- fits the pattern, creates the necessary elements, and returns
- back a list consisting of NanoDom elements and/or strings.
+ fits the pattern, creates the necessary elements, adds them
+ to InlineStash, and returns a string with placeholders
+ instead of DOM elements.
Keyword arguments:
- * line: the text to be processed
+ * data: the text to be processed
* pattern: the pattern to be checked
+ * patternIndex: index of current pattern
- Returns: The appropriate newly created NanoDom element if the
- pattern matches, None otherwise.
+ Returns: Tuple of (string with placeholders, True if the pattern matched else False).
"""
+
+ match = pattern.getCompiledRegExp().match(data)
+
+ if not match:
+ return data, False
- # match the line to pattern's pre-compiled reg exp.
- # if no match, move on.
-
-
-
- m = pattern.getCompiledRegExp().match(line)
- if not m:
- return None
-
- # if we got a match let the pattern make us a NanoDom node
- # if it doesn't, move on
- node = pattern.handleMatch(m, self.doc)
-
- # check if any of this nodes have children that need processing
-
+ node = pattern.handleMatch(match, self.doc)
+
if isinstance(node, Element):
+ for child in node.childNodes:
+ if isinstance(child, TextNode):
+ child.value = self._handleInline(child.value, patternIndex)
+
+
+ pholder = self.inlineStash.add(node)
- if not node.nodeName in ["code", "pre"]:
- for child in node.childNodes:
- if isinstance(child, TextNode):
-
- result = self._handleInline(child.value, patternIndex+1)
-
- if result:
-
- if result == [child]:
- continue
-
- result.reverse()
- #to make insertion easier
-
- position = node.childNodes.index(child)
-
- node.removeChild(child)
-
- for item in result:
-
- if isinstance(item, (str, unicode)):
- if len(item) > 0:
- node.insertChild(position,
- self.doc.createTextNode(item))
- else:
- node.insertChild(position, item)
-
-
-
-
- if node:
- # Those are in the reverse order!
- return ( m.groups()[-1], # the string to the left
- node, # the new node
- m.group(1)) # the string to the right of the match
-
- else:
- return None
+ return "%s%s%s" % (match.group(1), pholder, match.groups()[-1]), True
def _processTree(self, el):
+ """
+ Processing NanoDOM markdown tree, and applying inline patterns
+
+ Keyword arguments:
+
+ * el - parent element of Document.
+
+ Returns: NanoDOM Document object with applied inline patterns.
+ """
stack = [el]
while stack:
@@ -1958,7 +2014,8 @@ class Markdown:
if child.type == "inline":
- lst = self._handleInline(child.value)
+ lst = self._processPlaceholders(self._handleInline(
+ child.value))
pos = currElement.childNodes.index(child)
@@ -1971,7 +2028,7 @@ class Markdown:
del currElement.childNodes[pos]
for newChild in lst:
currElement.insertChild(pos, newChild)
- pos += 1
+ pos += 1
def applyInlinePatterns(self, markdownTree):
"""
@@ -2003,7 +2060,7 @@ class Markdown:
* source: An ascii or unicode string of Markdown formatted text.
- Returns: NanoDOM document.
+ Returns: NanoDOM Document object.
"""
if source is not None: #Allow blank string
self.source = source
diff --git a/tests/markdown-test/benchmark.dat b/tests/markdown-test/benchmark.dat
index 5b645ed..3d549dd 100644
--- a/tests/markdown-test/benchmark.dat
+++ b/tests/markdown-test/benchmark.dat
@@ -1,20 +1,20 @@
construction:0.000000:0.000000
-amps-and-angle-encoding:0.060000:0.000000
-auto-links:0.070000:135168.000000
-backlash-escapes:0.220000:360448.000000
+amps-and-angle-encoding:0.070000:131072.000000
+auto-links:0.080000:397312.000000
+backlash-escapes:0.270000:884736.000000
blockquotes-with-dode-blocks:0.020000:0.000000
-hard-wrapped:0.010000:0.000000
-horizontal-rules:0.140000:0.000000
-inline-html-advanced:0.060000:0.000000
-inline-html-comments:0.070000:0.000000
-inline-html-simple:0.170000:0.000000
-links-inline:0.100000:0.000000
-links-reference:0.120000:0.000000
-literal-quotes:0.070000:0.000000
-markdown-documentation-basics:0.740000:1175552.000000
-markdown-syntax:3.030000:2596864.000000
-nested-blockquotes:0.100000:0.000000
-ordered-and-unordered-list:0.360000:0.000000
-strong-and-em-together:0.110000:0.000000
-tabs:0.120000:0.000000
-tidyness:0.120000:0.000000
+hard-wrapped:0.020000:0.000000
+horizontal-rules:0.180000:135168.000000
+inline-html-advanced:0.070000:0.000000
+inline-html-comments:0.080000:0.000000
+inline-html-simple:0.210000:0.000000
+links-inline:0.140000:0.000000
+links-reference:0.170000:0.000000
+literal-quotes:0.090000:0.000000
+markdown-documentation-basics:0.690000:1806336.000000
+markdown-syntax:3.310000:6696960.000000
+nested-blockquotes:0.200000:0.000000
+ordered-and-unordered-list:0.530000:0.000000
+strong-and-em-together:0.200000:0.000000
+tabs:0.200000:0.000000
+tidyness:0.200000:0.000000