diff options
-rw-r--r-- | markdown/preprocessors.py | 48 | ||||
-rw-r--r-- | tests/misc/html.html | 4 | ||||
-rw-r--r-- | tests/misc/html.txt | 4 |
3 files changed, 47 insertions, 9 deletions
diff --git a/markdown/preprocessors.py b/markdown/preprocessors.py index 5342d44..072784b 100644 --- a/markdown/preprocessors.py +++ b/markdown/preprocessors.py @@ -77,17 +77,41 @@ class HtmlBlockPreprocessor(Preprocessor): """Remove html blocks from the text and store them for later retrieval.""" right_tag_patterns = ["</%s>", "%s>"] + attrs_pattern = r""" + \s+(?P<attr> [^>"'/ ]+)=(?P<q>['"])(?P<value>.*?)(?P=q) # attr="value" + | # OR + \s+(?P<sattr> [^>"'/ ]+) # attr + """ + left_tag_pattern = r'^\<(?P<tag>[^> ]+)(?P<attrs>(%s)*)\s*\/?\>?' % attrs_pattern + attrs_re = re.compile(attrs_pattern, re.VERBOSE) + left_tag_re = re.compile(left_tag_pattern, re.VERBOSE) def _get_left_tag(self, block): - return block[1:].replace(">", " ", 1).split()[0].lower() + m = self.left_tag_re.match(block) + if m: + tag = m.group('tag') + raw_attrs = m.group('attrs') + attrs = {} + if raw_attrs: + for ma in self.attrs_re.finditer(raw_attrs): + if ma.group('value'): + attrs[ma.group('attr').strip()] = ma.group('value') + elif ma.group('attr'): + attrs[ma.group('attr').strip()] = "" + return tag, len(m.group(0)), attrs + else: + tag = block[1:].replace(">", " ", 1).split()[0].lower() + return tag, len(tag+2), {} - def _get_right_tag(self, left_tag, block): + #return block[1:].replace(">", " ", 1).split()[0].lower() + + def _get_right_tag(self, left_tag, left_index, block): for p in self.right_tag_patterns: tag = p % left_tag i = block.rfind(tag) if i > 2: - return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag) - return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block) + return tag.lstrip("<").rstrip(">"), i + len(p)-2 + left_index-2 + return block.rstrip()[-left_index:-1].lower(), len(block) def _equal_tags(self, left_tag, right_tag): if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc. @@ -113,7 +137,7 @@ class HtmlBlockPreprocessor(Preprocessor): left_tag = '' right_tag = '' in_tag = False # flag - + #import pdb; pdb.set_trace() while text: block = text[0] if block.startswith("\n"): @@ -125,13 +149,17 @@ class HtmlBlockPreprocessor(Preprocessor): if not in_tag: if block.startswith("<"): - left_tag = self._get_left_tag(block) - right_tag, data_index = self._get_right_tag(left_tag, block) + left_tag, left_index, attrs = self._get_left_tag(block) + right_tag, data_index = self._get_right_tag(left_tag, + left_index, + block) if block[1] == "!": # is a comment block left_tag = "--" - right_tag, data_index = self._get_right_tag(left_tag, block) + right_tag, data_index = self._get_right_tag(left_tag, + left_index, + block) # keep checking conditions below and maybe just append if data_index < len(block) \ @@ -171,7 +199,9 @@ class HtmlBlockPreprocessor(Preprocessor): else: items.append(block) - right_tag, data_index = self._get_right_tag(left_tag, block) + right_tag, data_index = self._get_right_tag(left_tag, + left_index, + block) if self._equal_tags(left_tag, right_tag): # if find closing tag diff --git a/tests/misc/html.html b/tests/misc/html.html index 81ac5ee..cd6d4af 100644 --- a/tests/misc/html.html +++ b/tests/misc/html.html @@ -5,5 +5,9 @@ <p>Now some <arbitrary>arbitrary tags</arbitrary>.</p> <div>More block level html.</div> +<div class="foo bar" title="with 'quoted' text." valueless_attr weirdness="<i>foo</i>"> +Html with various attributes. +</div> + <p>And of course <script>blah</script>.</p> <p><a href="script>stuff</script">this <script>link</a></p>
\ No newline at end of file diff --git a/tests/misc/html.txt b/tests/misc/html.txt index 3ac3ae0..c08fe1d 100644 --- a/tests/misc/html.txt +++ b/tests/misc/html.txt @@ -7,6 +7,10 @@ Now some <arbitrary>arbitrary tags</arbitrary>. <div>More block level html.</div> +<div class="foo bar" title="with 'quoted' text." valueless_attr weirdness="<i>foo</i>"> +Html with various attributes. +</div> + And of course <script>blah</script>. [this <script>link](<script>stuff</script>) |