From fc3020c68575042a1ff5426ddd94bc4be65bf77d Mon Sep 17 00:00:00 2001
From: Waylan Limberg <waylan@gmail.com>
Date: Sun, 3 Jan 2010 20:59:38 -0500
Subject: Fixed Ticket 48. Quoted attributes in raw html are specifically
 acknowledged now - allowing various arbitrary stuff (like x/html) to be
 included without breaking the rawhtml parser.

Although currently unused, the code also provides the parsed attributes as a dict. Should be useful for adding support for parsing markdown text within rawhtml in an extension.
---
 markdown/preprocessors.py | 48 ++++++++++++++++++++++++++++++++++++++---------
 tests/misc/html.html      |  4 ++++
 tests/misc/html.txt       |  4 ++++
 3 files changed, 47 insertions(+), 9 deletions(-)
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py
index 5342d44..072784b 100644
--- a/markdown/preprocessors.py
+++ b/markdown/preprocessors.py
@@ -77,17 +77,41 @@ class HtmlBlockPreprocessor(Preprocessor):
     """Remove html blocks from the text and store them for later retrieval."""
 
     right_tag_patterns = ["</%s>", "%s>"]
+    attrs_pattern = r"""
+        \s+(?P<attr> [^>"'/ ]+)=(?P<q>['"])(?P<value>.*?)(?P=q)   # attr="value"
+        |                                                         # OR 
+        \s+(?P<sattr> [^>"'/ ]+)                                  # attr
+        """
+    left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern
+    attrs_re = re.compile(attrs_pattern, re.VERBOSE)
+    left_tag_re = re.compile(left_tag_pattern, re.VERBOSE)
 
     def _get_left_tag(self, block):
-        return block[1:].replace(">", " ", 1).split()[0].lower()
+        m = self.left_tag_re.match(block)
+        if m:
+            tag = m.group('tag')
+            raw_attrs = m.group('attrs')
+            attrs = {}
+            if raw_attrs:
+                for ma in self.attrs_re.finditer(raw_attrs):
+                    if ma.group('value'):
+                        attrs[ma.group('attr').strip()] = ma.group('value')
+                    else:  # attr="" or a valueless attribute (sattr group)
+                        attrs[(ma.group('attr') or ma.group('sattr')).strip()] = ""
+            return tag, len(m.group(0)), attrs
+        else:
+            tag = block[1:].replace(">", " ", 1).split()[0].lower()
+            return tag, len(tag)+2, {}  # width of "<tag>"
 
-    def _get_right_tag(self, left_tag, block):
+        #return block[1:].replace(">", " ", 1).split()[0].lower()
+
+    def _get_right_tag(self, left_tag, left_index, block):
         for p in self.right_tag_patterns:
             tag = p % left_tag
             i = block.rfind(tag)
             if i > 2:
-                return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
-        return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
+                return tag.lstrip("<").rstrip(">"), i + len(p)-2 + left_index-2
+        return block.rstrip()[-left_index:-1].lower(), len(block)
 
     def _equal_tags(self, left_tag, right_tag):
         if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
@@ -113,7 +137,7 @@ class HtmlBlockPreprocessor(Preprocessor):
         left_tag = ''
         right_tag = ''
         in_tag = False # flag
-
+        #import pdb; pdb.set_trace()
         while text:
             block = text[0]
             if block.startswith("\n"):
@@ -125,13 +149,17 @@ class HtmlBlockPreprocessor(Preprocessor):
 
             if not in_tag:
                 if block.startswith("<"):
-                    left_tag = self._get_left_tag(block)
-                    right_tag, data_index = self._get_right_tag(left_tag, block)
+                    left_tag, left_index, attrs = self._get_left_tag(block)
+                    right_tag, data_index = self._get_right_tag(left_tag, 
+                                                                left_index,
+                                                                block)
 
                     if block[1] == "!":
                         # is a comment block
                         left_tag = "--"
-                        right_tag, data_index = self._get_right_tag(left_tag, block)
+                        right_tag, data_index = self._get_right_tag(left_tag, 
+                                                                    left_index,
+                                                                    block)
                         # keep checking conditions below and maybe just append
                     
                     if data_index < len(block) \
@@ -171,7 +199,9 @@ class HtmlBlockPreprocessor(Preprocessor):
             else:
                 items.append(block)
 
-                right_tag, data_index = self._get_right_tag(left_tag, block)
+                right_tag, data_index = self._get_right_tag(left_tag, 
+                                                            left_index, 
+                                                            block)
 
                 if self._equal_tags(left_tag, right_tag):
                     # if find closing tag
diff --git a/tests/misc/html.html b/tests/misc/html.html
index 81ac5ee..cd6d4af 100644
--- a/tests/misc/html.html
+++ b/tests/misc/html.html
@@ -5,5 +5,9 @@
 <p>Now some <arbitrary>arbitrary tags</arbitrary>.</p>
 <div>More block level html.</div>
 
+<div class="foo bar" title="with 'quoted' text." valueless_attr weirdness="<i>foo</i>">
+Html with various attributes.
+</div>
+
 <p>And of course <script>blah</script>.</p>
 <p><a href="script&gt;stuff&lt;/script">this <script>link</a></p>
\ No newline at end of file
diff --git a/tests/misc/html.txt b/tests/misc/html.txt
index 3ac3ae0..c08fe1d 100644
--- a/tests/misc/html.txt
+++ b/tests/misc/html.txt
@@ -7,6 +7,10 @@ Now some <arbitrary>arbitrary tags</arbitrary>.
 
 <div>More block level html.</div>
 
+<div class="foo bar" title="with 'quoted' text." valueless_attr weirdness="<i>foo</i>">
+Html with various attributes.
+</div>
+
 And of course <script>blah</script>.
 
 [this <script>link](<script>stuff</script>)
-- 
cgit v1.2.3