diff options
author | Waylan Limberg <waylan@gmail.com> | 2011-11-17 22:43:02 -0500 |
---|---|---|
committer | Waylan Limberg <waylan@gmail.com> | 2011-11-17 22:43:02 -0500 |
commit | ef9a229ebeaf8173e9fd4e541de4d83e8678f649 (patch) | |
tree | 615040f2b9778eaac1544bc2c1b26e039c0fdc1c | |
parent | c53307a4d555c04e97739fefe0cafc2e97d55328 (diff) | |
download | markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.tar.gz markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.tar.bz2 markdown-ef9a229ebeaf8173e9fd4e541de4d83e8678f649.zip |
Fixed #47. Improved HRProcessor.

Python's re module does not support atomic grouping, which was slowing the HR regex down if a long HR ended with a non-HR character (causing the regex to backtrack). Therefore, we have to simulate atomic grouping. Fortunately, we only need to match end-of-line or end-of-string after the atomic group here, so it was an easy case to simulate: just remove the '$' from the end of the regex and manually check using m.end(). The run method was refactored while I was at it, saving us from running the regex twice for each HR.
-rw-r--r-- | markdown/blockprocessors.py | 37 | ||||
-rw-r--r-- | tests/misc/para-with-hr.html | 5 | ||||
-rw-r--r-- | tests/misc/para-with-hr.txt | 3 |
3 files changed, 26 insertions, 19 deletions
diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py index 7223da4..de3f136 100644 --- a/markdown/blockprocessors.py +++ b/markdown/blockprocessors.py @@ -460,35 +460,36 @@ class SetextHeaderProcessor(BlockProcessor): class HRProcessor(BlockProcessor): """ Process Horizontal Rules. """ - RE = r'[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*' + RE = r'^[ ]{0,3}((-+[ ]{0,2}){3,}|(_+[ ]{0,2}){3,}|(\*+[ ]{0,2}){3,})[ ]*' # Detect hr on any line of a block. - SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE) - # Match a hr on a single line of text. - MATCH_RE = re.compile(r'^%s$' % RE) + SEARCH_RE = re.compile(RE, re.MULTILINE) def test(self, parent, block): - return bool(self.SEARCH_RE.search(block)) + m = self.SEARCH_RE.search(block) + # No atomic grouping in python so we simulate it here for performance. + # The regex only matches what would be in the atomic group - the HR. + # Then check if we are at end of block or if next char is a newline. + if m and (m.end() == len(block) or block[m.end()] == '\n'): + # Save match object on class instance so we can use it later. + self.match = m + return True + return False def run(self, parent, blocks): - lines = blocks.pop(0).split('\n') - prelines = [] + block = blocks.pop(0) # Check for lines in block before hr. - for line in lines: - m = self.MATCH_RE.match(line) - if m: - break - else: - prelines.append(line) - if len(prelines): + prelines = block[:self.match.start()].rstrip('\n') + if prelines: # Recursively parse lines before hr so they get parsed first. - self.parser.parseBlocks(parent, ['\n'.join(prelines)]) + self.parser.parseBlocks(parent, [prelines]) # create hr hr = util.etree.SubElement(parent, 'hr') # check for lines in block after hr. - lines = lines[len(prelines)+1:] - if len(lines): + postlines = block[self.match.end():].lstrip('\n') + if postlines: # Add lines after hr to master blocks for later parsing. 
- blocks.insert(0, '\n'.join(lines)) + blocks.insert(0, postlines) + class EmptyBlockProcessor(BlockProcessor): diff --git a/tests/misc/para-with-hr.html b/tests/misc/para-with-hr.html index 8569fec..7607449 100644 --- a/tests/misc/para-with-hr.html +++ b/tests/misc/para-with-hr.html @@ -1,3 +1,6 @@ <p>Here is a paragraph, followed by a horizontal rule.</p> <hr /> -<p>Followed by another paragraph.</p>
\ No newline at end of file +<p>Followed by another paragraph.</p> +<p>Here is another paragraph, followed by: +*** not an HR. +Followed by more of the same paragraph.</p>
\ No newline at end of file diff --git a/tests/misc/para-with-hr.txt b/tests/misc/para-with-hr.txt index 20735fb..165bbe3 100644 --- a/tests/misc/para-with-hr.txt +++ b/tests/misc/para-with-hr.txt @@ -2,3 +2,6 @@ Here is a paragraph, followed by a horizontal rule. *** Followed by another paragraph. +Here is another paragraph, followed by: +*** not an HR. +Followed by more of the same paragraph. |