Fixed various issues with the core parser - mostly whitespace related and updated a few tests that weren't quite right - that is they now better match pl or php implementations.

author: Waylan Limberg <waylan@gmail.com> 2008-11-12 23:00:31 -0500
committer: Waylan Limberg <waylan@gmail.com> 2008-11-13 23:25:15 -0500
commit: 8d367ca71c610c49d3b3d5c81f49cb38f9e97fb9 (patch)
tree: 77823e3c677e403382dd22473e688fb93568c352
parent: ba147ca9b2eae544e802c8216936065d2d86a8d8 (diff)
download: markdown-8d367ca71c610c49d3b3d5c81f49cb38f9e97fb9.tar.gz
markdown-8d367ca71c610c49d3b3d5c81f49cb38f9e97fb9.tar.bz2
markdown-8d367ca71c610c49d3b3d5c81f49cb38f9e97fb9.zip
3 files changed, 86 insertions, 36 deletions
diff --git a/markdown.py b/markdown.py
index b40093f..9fc6355 100755
--- a/markdown.py
+++ b/markdown.py
@@ -240,17 +240,26 @@ class ListIndentProcessor(BlockProcessor):
     """ Process children of list items. """
 
     def test(self, parent, block):
-        return block.startswith(' '*4) and parent[-1] and \
-                (parent[-1].tag == "ul" or parent[-1].tag == "ol")
+        return block.startswith(' '*4) and \
+                (parent.tag == "li" or \
+                    (len(parent) and parent[-1] and \
+                        (parent[-1].tag == "ul" or parent[-1].tag == "ol")
+                    )
+                )
 
     def run(self, parent, blocks):
-        block = blocks.pop(0)
+        block = self.looseDetab(blocks.pop(0))
         sibling = self.lastChild(parent)
-        if len(sibling) and sibling[-1].tag == 'li':
-            self.parser.parseBlocks(sibling[-1], [self.looseDetab(block)])
+        if parent.tag == 'li':
+            self.parser.parseBlocks(parent, [block])
+        elif len(sibling) and sibling[-1].tag == 'li':
+            if sibling[-1].text:
+                block = '%s\n\n%s' % (sibling[-1].text, block)
+                sibling[-1].text = ''
+            self.parser.parseChunk(sibling[-1], block)
         else:
             li = etree.SubElement(sibling, 'li')
-            self.parser.parseBlocks(li, [self.looseDetab(block)])
+            self.parser.parseBlocks(li, [block])
 
 
 class CodeBlockProcessor(BlockProcessor):
@@ -267,12 +276,12 @@ class CodeBlockProcessor(BlockProcessor):
                     and sibling[0].tag == "code":
             code = sibling[0]
             block, theRest = self.detab(block)
-            code.text = '%s\n%s\n' % (code.text, block.rstrip())
+            code.text = AtomicString('%s\n%s\n' % (code.text, block.rstrip()))
         else:
             pre = etree.SubElement(parent, 'pre')
             code = etree.SubElement(pre, 'code')
             block, theRest = self.detab(block)
-            code.text = '%s\n' % block.rstrip()
+            code.text = AtomicString('%s\n' % block.rstrip())
         if theRest:
             blocks.insert(0, theRest)
 
@@ -292,15 +301,15 @@ class BlockQuoteProcessor(BlockProcessor):
             quote = sibling
         else:
             quote = etree.SubElement(parent, 'blockquote')
-        self.parser.parseBlocks(quote, [block])
+        self.parser.parseChunk(quote, block)
 
     def clean(self, line):
         """ Remove ``>`` from begining of a line. """
         m = self.RE.match(line)
-        if m:
-            return m.group(1)
-        elif line.strip() == ">":
+        if line.strip() == ">":
             return ""
+        elif m:
+            return m.group(1)
         else:
             return line
 
@@ -318,7 +327,7 @@ class OListProcessor(BlockProcessor):
         sibling = self.lastChild(parent)
         if sibling and sibling.tag == self.TAG:
             lst = sibling
-            # make sure previous item is in a p. 
+            # make sure previous item is in a p.
             if len(lst) and lst[-1].text and not len(lst[-1]):
                 p = etree.SubElement(lst[-1], 'p')
                 p.text = lst[-1].text
@@ -333,8 +342,11 @@ class OListProcessor(BlockProcessor):
             lst = etree.SubElement(parent, self.TAG)
         self.parser.state = 'list'
         for item in items:
-            li = etree.SubElement(lst, 'li')
-            self.parser.parseBlocks(li, [item])
+            if item.startswith(' '*4):
+                self.parser.parseBlocks(lst[-1], [item])
+            else:
+                li = etree.SubElement(lst, 'li')
+                self.parser.parseBlocks(li, [item])
         self.parser.resetState()
 
     def get_items(self, block):
@@ -344,6 +356,11 @@ class OListProcessor(BlockProcessor):
             m = self.RE.match(line)
             if m:
                 items.append(m.group(1))
+            elif line.startswith(' '*4):
+                if items[-1].startswith(' '*4):
+                    items[-1] = '%s\n%s' % (items[-1], line)
+                else:
+                    items.append(line)
             else:
                 items[-1] = '\n'.join([items[-1], line])
         return items
@@ -359,22 +376,25 @@ class UListProcessor(OListProcessor):
 class HashHeaderProcessor(BlockProcessor):
     """ Process Hash Headers. """
 
-    RE = re.compile(r'^(#{1,6})(.*?)#*$')
+    RE = re.compile(r'(^|\n)(?P<level>#{1,6})(?P<header>.*?)#*(\n|$)')
 
     def test(self, parent, block):
-        return block.startswith('#')
+        return bool(self.RE.search(block))
 
     def run(self, parent, blocks):
-        lines = blocks.pop(0).split('\n')
-        line1 = lines.pop(0)
-        m = self.RE.match(line1)
+        block = blocks.pop(0)
+        m = self.RE.search(block)
         if m:
-            h = etree.SubElement(parent, 'h%d' % len(m.group(1)))
-            h.text = m.group(2).strip()
+            before = block[:m.start()]
+            after = block[m.end():]
+            if before:
+                self.parser.parseBlocks(parent, [before])
+            h = etree.SubElement(parent, 'h%d' % len(m.group('level')))
+            h.text = m.group('header').strip()
+            if after:
+                blocks.insert(0, after)
         else:
-            lines.insert(0, line1)
-        if len(lines):
-            blocks.insert(0, '\n'.join(lines))
+            message(CRITICAL, "We've got a problem header!")
 
 
 class SHeaderProcessor(BlockProcessor):
@@ -400,17 +420,20 @@ class SHeaderProcessor(BlockProcessor):
 class HRProcessor(BlockProcessor):
     """ Process Horizontal Rules. """
 
-    RE = re.compile(r'([*_-][ ]?){3,}')
+    RE = r'[ ]{0,3}(?P<ch>[*_-])[ ]?((?P=ch)[ ]?){2,}[ ]*'
+    SEARCH_RE = re.compile(r'(^|\n)%s(\n|$)' % RE)
+    MATCH_RE = re.compile(r'^%s$' % RE)
 
     def test(self, parent, block):
-        return bool(self.RE.search(block))
+        return bool(self.SEARCH_RE.search(block))
 
     def run(self, parent, blocks):
         # Check for lines in block before hr.
+        #import ipdb; ipdb.set_trace()
         lines = blocks.pop(0).split('\n')
         prelines = []
         for line in lines:
-            m = self.RE.match(line)
+            m = self.MATCH_RE.match(line)
             if m:
                 break
             else:
@@ -425,6 +448,25 @@ class HRProcessor(BlockProcessor):
             blocks.insert(0, '\n'.join(lines))
 
 
+class EmptyBlockProcessor(BlockProcessor):
+    """ Process blocks and start with an empty line. """
+
+    RE = re.compile(r'^\s*\n')
+
+    def test(self, parent, block):
+        return bool(self.RE.match(block))
+
+    def run(self, parent, blocks):
+        block = blocks.pop(0)
+        m = self.RE.match(block)
+        if m:
+            blocks.insert(0, block[m.end():])
+            sibling = self.lastChild(parent)
+            if sibling and sibling.tag == 'pre' and sibling[0] and \
+                    sibling[0].tag == 'code':
+                sibling[0].text = AtomicString('%s/n/n/n' % sibling[0].text )
+
+
 class PBlockProcessor(BlockProcessor):
     """ Process Paragraph blocks. """
 
@@ -435,7 +477,10 @@ class PBlockProcessor(BlockProcessor):
         block = blocks.pop(0)
         if block.strip():
             if self.parser.state == 'list':
-                parent.text = block
+                if parent.text:
+                    parent.text = '%s\n%s' % (parent.text, block)
+                else:
+                    parent.text = block
             else:
                 p = etree.SubElement(parent, 'p')
                 p.text = block
@@ -446,6 +491,7 @@ class BlockParser:
 
     def __init__(self):
         self.blockprocessors = OrderedDict()
+        self.blockprocessors['empty'] = EmptyBlockProcessor(self)
         self.blockprocessors['indent'] = ListIndentProcessor(self)
         self.blockprocessors['code'] = CodeBlockProcessor(self)
         self.blockprocessors['hashheader'] = HashHeaderProcessor(self)
@@ -464,10 +510,13 @@ class BlockParser:
         """ Parse a markdown string into an ElementTree. """
         # Create a ElementTree from the lines
         root = etree.Element("div")
-        blocks = '\n'.join(lines).split('\n\n')
-        self.parseBlocks(root, blocks)
+        self.parseChunk(root, '\n'.join(lines))
         return etree.ElementTree(root)
 
+    def parseChunk(self, parent, text):
+        """ Parse a chunk of markdown text and attach to given etree node. """
+        self.parseBlocks(parent, text.split('\n\n'))
+
     def parseBlocks(self, parent, blocks):
         """ Process blocks of markdown text and attach to given etree node. """
         while blocks:
@@ -1722,6 +1771,7 @@ class Markdown:
 
         source = source.replace(STX, "").replace(ETX, "")
         source = source.replace("\r\n", "\n").replace("\r", "\n") + "\n\n"
+        source = re.sub(r'\n\s\n', '\n\n', source)
         source = source.expandtabs(TAB_LENGTH)
 
         # Split into lines and run the line preprocessors.
diff --git a/tests/misc/blockquote.html b/tests/misc/blockquote.html
index b52e92d..4481d51 100644
--- a/tests/misc/blockquote.html
+++ b/tests/misc/blockquote.html
@@ -10,13 +10,15 @@
 <p>blockquote with 2 spaces.</p>
 </blockquote>
 <p>baz</p>
-<p>   &gt; this has three spaces so its a paragraph.</p>
+<blockquote>
+<p>this has three spaces so its a paragraph.</p>
+</blockquote>
 <p>blah</p>
 <pre><code>&gt; this one had four so it's a code block.
 </code></pre>
 <blockquote>
 <blockquote>
 <p>this nested blockquote has 0 on level one and 3 (one after the first <code>&gt;</code> + 2 more) on level 2.</p>
+<p>and this has 4 on level 2 - another code block.</p>
 </blockquote>
-<p>   &gt; and this has 4 on level 2 - another code block.</p>
 </blockquote>
 \ No newline at end of file
diff --git a/tests/misc/multi-paragraph-block-quote.html b/tests/misc/multi-paragraph-block-quote.html
index 3602405..e13986a 100644
--- a/tests/misc/multi-paragraph-block-quote.html
+++ b/tests/misc/multi-paragraph-block-quote.html
@@ -1,8 +1,6 @@
 <blockquote>
 <p>This is line one of paragraph one
-This is line two of paragraph one</p>
+ This is line two of paragraph one</p>
 <p>This is line one of paragraph two</p>
-</blockquote>
-<blockquote>
 <p>This is another blockquote.</p>
 </blockquote>
 \ No newline at end of file
author	Waylan Limberg <waylan@gmail.com>	2008-11-12 23:00:31 -0500
committer	Waylan Limberg <waylan@gmail.com>	2008-11-13 23:25:15 -0500
commit	8d367ca71c610c49d3b3d5c81f49cb38f9e97fb9 (patch)
tree	77823e3c677e403382dd22473e688fb93568c352
parent	ba147ca9b2eae544e802c8216936065d2d86a8d8 (diff)
download	markdown-8d367ca71c610c49d3b3d5c81f49cb38f9e97fb9.tar.gz markdown-8d367ca71c610c49d3b3d5c81f49cb38f9e97fb9.tar.bz2 markdown-8d367ca71c610c49d3b3d5c81f49cb38f9e97fb9.zip