aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWaylan Limberg <waylan@gmail.com>2011-11-17 22:43:02 -0500
committerWaylan Limberg <waylan@gmail.com>2011-11-17 22:43:02 -0500
commitef9a229ebeaf8173e9fd4e541de4d83e8678f649 (patch)
tree615040f2b9778eaac1544bc2c1b26e039c0fdc1c
parentc53307a4d555c04e97739fefe0cafc2e97d55328 (diff)
downloadmarkdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.tar.gz
markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.tar.bz2
markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.zip
Fixed #47. Improved HRProcessor. Python's re module does not support atomic grouping, which was slowing the HR regex down if a long HR ended with a non-HR character (causing the regex to backtrack). Therefore, we have to simulate atomic grouping. Fortunately, we only need to match end-of-line or end-of-string after the atomic group here, so it was an easy case to simulate: just remove the '$' from the end of the regex and manually check using m.end(). The run method was refactored while I was at it, saving us from running the regex twice for each HR.
-rw-r--r--markdown/blockprocessors.py37
-rw-r--r--tests/misc/para-with-hr.html5
-rw-r--r--tests/misc/para-with-hr.txt3
3 files changed, 26 insertions, 19 deletions
diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py
index 7223da4..de3f136 100644
--- a/markdown/blockprocessors.py
+++ b/markdown/blockprocessors.py
@@ -460,35 +460,36 @@ class SetextHeaderProcessor(BlockProcessor):
class HRProcessor(BlockProcessor):
""" Process Horizontal Rules. """
- RE = r'[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
+ RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
# Detect hr on any line of a block.
- SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
- # Match a hr on a single line of text.
- MATCH_RE = re.compile(r'^%s$' % RE)
+ SEARCH_RE = re.compile(RE, re.MULTILINE)
def test(self, parent, block):
- return bool(self.SEARCH_RE.search(block))
+ m = self.SEARCH_RE.search(block)
+ # No atomic grouping in python so we simulate it here for performance.
+ # The regex only matches what would be in the atomic group - the HR.
+ # Then check if we are at end of block or if next char is a newline.
+ if m and (m.end() == len(block) or block[m.end()] == '\n'):
+ # Save match object on class instance so we can use it later.
+ self.match = m
+ return True
+ return False
def run(self, parent, blocks):
- lines = blocks.pop(0).split('\n')
- prelines = []
+ block = blocks.pop(0)
# Check for lines in block before hr.
- for line in lines:
- m = self.MATCH_RE.match(line)
- if m:
- break
- else:
- prelines.append(line)
- if len(prelines):
+ prelines = block[:self.match.start()].rstrip('\n')
+ if prelines:
# Recursively parse lines before hr so they get parsed first.
- self.parser.parseBlocks(parent, ['\n'.join(prelines)])
+ self.parser.parseBlocks(parent, [prelines])
# create hr
hr = util.etree.SubElement(parent, 'hr')
# check for lines in block after hr.
- lines = lines[len(prelines)+1:]
- if len(lines):
+ postlines = block[self.match.end():].lstrip('\n')
+ if postlines:
# Add lines after hr to master blocks for later parsing.
- blocks.insert(0, '\n'.join(lines))
+ blocks.insert(0, postlines)
+
class EmptyBlockProcessor(BlockProcessor):
diff --git a/tests/misc/para-with-hr.html b/tests/misc/para-with-hr.html
index 8569fec..7607449 100644
--- a/tests/misc/para-with-hr.html
+++ b/tests/misc/para-with-hr.html
@@ -1,3 +1,6 @@
<p>Here is a paragraph, followed by a horizontal rule.</p>
<hr />
-<p>Followed by another paragraph.</p> \ No newline at end of file
+<p>Followed by another paragraph.</p>
+<p>Here is another paragraph, followed by:
+*** not an HR.
+Followed by more of the same paragraph.</p> \ No newline at end of file
diff --git a/tests/misc/para-with-hr.txt b/tests/misc/para-with-hr.txt
index 20735fb..165bbe3 100644
--- a/tests/misc/para-with-hr.txt
+++ b/tests/misc/para-with-hr.txt
@@ -2,3 +2,6 @@ Here is a paragraph, followed by a horizontal rule.
***
Followed by another paragraph.
+Here is another paragraph, followed by:
+*** not an HR.
+Followed by more of the same paragraph.