Fixed #47. Improved HRProcessor.\n\nPython's re module does not support atomic grouping, which was slowing the HR regex down if a long HR ended with a non-HR char (causing the regex to backtrack). Therefore, we have to simulate atomic grouping. Fortunately, we only need to match end-of-line or end-of-string after the atomic group here, so it was an easy case to simulate. Just remove the '$' from the end of the regex and manually check using m.end(). The run method was refactored while I was at it, saving us from running the regex twice for each HR.

author: Waylan Limberg <waylan@gmail.com> 2011-11-17 22:43:02 -0500
committer: Waylan Limberg <waylan@gmail.com> 2011-11-17 22:43:02 -0500
commit: ef9a229ebeaf8173e9fd4e541de4d83e8678f649 (patch)
tree: 615040f2b9778eaac1544bc2c1b26e039c0fdc1c
parent: c53307a4d555c04e97739fefe0cafc2e97d55328 (diff)
download: markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.tar.gz
markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.tar.bz2
markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.zip
3 files changed, 26 insertions, 19 deletions
diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py
index 7223da4..de3f136 100644
--- a/markdown/blockprocessors.py
+++ b/markdown/blockprocessors.py
@@ -460,35 +460,36 @@ class SetextHeaderProcessor(BlockProcessor):
 class HRProcessor(BlockProcessor):
     """ Process Horizontal Rules. """
 
-    RE = r'[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
+    RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*'
     # Detect hr on any line of a block.
-    SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
-    # Match a hr on a single line of text.
-    MATCH_RE = re.compile(r'^%s$' % RE)
+    SEARCH_RE = re.compile(RE, re.MULTILINE)
 
     def test(self, parent, block):
-        return bool(self.SEARCH_RE.search(block))
+        m = self.SEARCH_RE.search(block)
+        # No atomic grouping in python so we simulate it here for performance.
+        # The regex only matches what would be in the atomic group - the HR.
+        # Then check if we are at end of block or if next char is a newline.
+        if m and (m.end() == len(block) or block[m.end()] == '\n'):
+            # Save match object on class instance so we can use it later.
+            self.match = m
+            return True
+        return False
 
     def run(self, parent, blocks):
-        lines = blocks.pop(0).split('\n')
-        prelines = []
+        block = blocks.pop(0)
         # Check for lines in block before hr.
-        for line in lines:
-            m = self.MATCH_RE.match(line)
-            if m:
-                break
-            else:
-                prelines.append(line)
-        if len(prelines):
+        prelines = block[:self.match.start()].rstrip('\n')
+        if prelines:
             # Recursively parse lines before hr so they get parsed first.
-            self.parser.parseBlocks(parent, ['\n'.join(prelines)])
+            self.parser.parseBlocks(parent, [prelines])
         # create hr
         hr = util.etree.SubElement(parent, 'hr')
         # check for lines in block after hr.
-        lines = lines[len(prelines)+1:]
-        if len(lines):
+        postlines = block[self.match.end():].lstrip('\n')
+        if postlines:
             # Add lines after hr to master blocks for later parsing.
-            blocks.insert(0, '\n'.join(lines))
+            blocks.insert(0, postlines)
+
 
 
 class EmptyBlockProcessor(BlockProcessor):
diff --git a/tests/misc/para-with-hr.html b/tests/misc/para-with-hr.html
index 8569fec..7607449 100644
--- a/tests/misc/para-with-hr.html
+++ b/tests/misc/para-with-hr.html
@@ -1,3 +1,6 @@
 <p>Here is a paragraph, followed by a horizontal rule.</p>
 <hr />
-<p>Followed by another paragraph.</p>
-\ No newline at end of file
+<p>Followed by another paragraph.</p>
+<p>Here is another paragraph, followed by:
+*** not an HR.
+Followed by more of the same paragraph.</p>
+\ No newline at end of file
diff --git a/tests/misc/para-with-hr.txt b/tests/misc/para-with-hr.txt
index 20735fb..165bbe3 100644
--- a/tests/misc/para-with-hr.txt
+++ b/tests/misc/para-with-hr.txt
@@ -2,3 +2,6 @@ Here is a paragraph, followed by a horizontal rule.
 ***
 Followed by another paragraph.
 
+Here is another paragraph, followed by:
+*** not an HR.
+Followed by more of the same paragraph.
author	Waylan Limberg <waylan@gmail.com>	2011-11-17 22:43:02 -0500
committer	Waylan Limberg <waylan@gmail.com>	2011-11-17 22:43:02 -0500
commit	ef9a229ebeaf8173e9fd4e541de4d83e8678f649 (patch)
tree	615040f2b9778eaac1544bc2c1b26e039c0fdc1c
parent	c53307a4d555c04e97739fefe0cafc2e97d55328 (diff)
download	markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.tar.gz markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.tar.bz2 markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.zip