From d18c3d0acab0e7469c3284c897afcb61f9dd1fea Mon Sep 17 00:00:00 2001 From: Isaac Muse Date: Wed, 17 Jan 2018 18:36:34 -0700 Subject: Flexible inline (#629) Add new InlineProcessor class that handles inline processing much better and allows for more flexibility. This adds new InlineProcessors that no longer utilize unnecessary pretext and posttext captures. New class can accept the buffer that is being worked on and manually process the text without regex and return new replacement bounds. This helps us to handle links in a better way and handle nested brackets and logic that is too much for regular expression. The refactor also allows image links to have links/paths with spaces like links. Ref #551, #613, #590, #161. --- docs/extensions/api.md | 109 +++++++- markdown/extensions/abbr.py | 12 +- markdown/extensions/footnotes.py | 16 +- markdown/extensions/nl2br.py | 4 +- markdown/extensions/smart_strong.py | 10 +- markdown/extensions/smarty.py | 18 +- markdown/extensions/wikilinks.py | 16 +- markdown/inlinepatterns.py | 483 ++++++++++++++++++++++++++--------- markdown/treeprocessors.py | 35 ++- markdown/util.py | 2 + tests/misc/image.html | 5 - tests/misc/image.txt | 12 - tests/test_apis.py | 10 +- tests/test_syntax/inline/__init__.py | 0 tests/test_syntax/inline/images.py | 139 ++++++++++ tests/test_syntax/inline/links.py | 98 +++++++ 16 files changed, 785 insertions(+), 184 deletions(-) delete mode 100644 tests/misc/image.html delete mode 100644 tests/misc/image.txt create mode 100644 tests/test_syntax/inline/__init__.py create mode 100644 tests/test_syntax/inline/images.py create mode 100644 tests/test_syntax/inline/links.py diff --git a/docs/extensions/api.md b/docs/extensions/api.md index cba4ea7..ad0d254 100644 --- a/docs/extensions/api.md +++ b/docs/extensions/api.md @@ -48,6 +48,8 @@ class MyPreprocessor(Preprocessor): ## Inline Patterns {: #inlinepatterns } +### Legacy + Inline Patterns implement the inline HTML element syntax for Markdown such as 
`*emphasis*` or `[links](http://example.com)`. Pattern objects should be instances of classes that inherit from `markdown.inlinepatterns.Pattern` or @@ -85,7 +87,7 @@ from markdown.util import etree class EmphasisPattern(Pattern): def handleMatch(self, m): el = etree.Element('em') - el.text = m.group(3) + el.text = m.group(2) return el ``` @@ -110,8 +112,113 @@ implemented with separate instances of the `SimpleTagPattern` listed below. Feel free to use or extend any of the Pattern classes found at `markdown.inlinepatterns`. +### Future + +While users can still create plugins with the existing +`markdown.inlinepatterns.Pattern`, a new, more flexible inline processor has +been added which users are encouraged to migrate to. The new inline processor +is found at `markdown.inlinepatterns.InlineProcessor`. + +The new processor is very similar to legacy with two major distinctions. + +1. Patterns no longer need to match the entire block, so patterns no longer + start with `r'^(.*?)'` and end with `r'(.*?)!'`. This was a huge + performance sink and this requirement has been removed. The returned match + object will only contain what is explicitly matched in the pattern, and + extension pattern groups now start with `m.group(1)`. + +2. The `handleMatch` method now takes an additional input called `data`, + which is the entire block under analysis, not just what is matched with + the specified pattern. The method also returns the element *and* the index + boundaries relative to `data` that the return element is replacing + (usually `m.start(0)` and `m.end(0)`). If the boundaries are returned as + `None`, it is assumed that the match did not take place, and nothing will + be altered in `data`. + +If all you need is the same functionality as the legacy processor, you can do +as shown below. Most of the time, simple regular expression processing is all +you'll need. 
+ +```python +from markdown.inlinepatterns import InlineProcessor +from markdown.util import etree + +# an oversimplified regex +MYPATTERN = r'\*([^*]+)\*' + +class EmphasisPattern(InlineProcessor): + def handleMatch(self, m, data): + el = etree.Element('em') + el.text = m.group(1) + return el, m.start(0), m.end(0) + +# pass in pattern and create instance +emphasis = EmphasisPattern(MYPATTERN) +``` + +But, the new processor allows you to handle much more complex patterns that are +too much for Python's `re` module to handle. For instance, to handle nested brackets in +link patterns, the built-in link inline processor uses the following pattern to +find where a link *might* start: + +```python +LINK_RE = NOIMG + r'\[' +link = LinkInlineProcessor(LINK_RE, md_instance) +``` + +It then uses programmed logic to actually walk the string (`data`), starting at +where the match ended (`m.end(0)`). If for whatever reason, the text +does not appear to be a link, it returns `None` for the start and end boundary +in order to communicate to the parser that no match was found. + +```python + # Just a snippet of the link's handleMatch + # method to illustrate new logic + def handleMatch(self, m, data): + text, index, handled = self.getText(data, m.end(0)) + + if not handled: + return None, None, None + + href, title, index, handled = self.getLink(data, index) + if not handled: + return None, None, None + + el = util.etree.Element("a") + el.text = text + + el.set("href", href) + + if title is not None: + el.set("title", title) + + return el, m.start(0), index +``` + ### Generic Pattern Classes +Some example processors that are available. + +* **`SimpleTextInlineProcessor(pattern)`**: + + Returns simple text of `group(1)` of a `pattern` and the start and end + position of the match. + +* **`SimpleTagInlineProcessor(pattern, tag)`**: + + Returns an element of type "`tag`" with a text attribute of `group(2)` + of a `pattern`. `tag` should be a string of a HTML element (i.e.: 'em'). 
+ It also returns the start and end position of the match. + +* **`SubstituteTagInlineProcessor(pattern, tag)`**: + + Returns an element of type "`tag`" with no children or text (i.e.: `br`) + and the start and end position of the match. + +A very small number of the basic legacy processors are still available to +prevent breakage of 3rd party extensions during the transition period to the +new processors. Three of the available processors are listed below. + * **`SimpleTextPattern(pattern)`**: Returns simple text of `group(2)` of a `pattern`. diff --git a/markdown/extensions/abbr.py b/markdown/extensions/abbr.py index 2553aac..5e8845b 100644 --- a/markdown/extensions/abbr.py +++ b/markdown/extensions/abbr.py @@ -20,7 +20,7 @@ from __future__ import absolute_import from __future__ import unicode_literals from . import Extension from ..preprocessors import Preprocessor -from ..inlinepatterns import Pattern +from ..inlinepatterns import InlineProcessor from ..util import etree, AtomicString import re @@ -52,7 +52,7 @@ class AbbrPreprocessor(Preprocessor): abbr = m.group('abbr').strip() title = m.group('title').strip() self.markdown.inlinePatterns['abbr-%s' % abbr] = \ - AbbrPattern(self._generate_pattern(abbr), title) + AbbrInlineProcessor(self._generate_pattern(abbr), title) # Preserve the line to prevent raw HTML indexing issue. # https://github.com/Python-Markdown/markdown/issues/584 new_text.append('') @@ -76,18 +76,18 @@ class AbbrPreprocessor(Preprocessor): return r'(?P\b%s\b)' % (r''.join(chars)) -class AbbrPattern(Pattern): +class AbbrInlineProcessor(InlineProcessor): """ Abbreviation inline pattern. 
""" def __init__(self, pattern, title): - super(AbbrPattern, self).__init__(pattern) + super(AbbrInlineProcessor, self).__init__(pattern) self.title = title - def handleMatch(self, m): + def handleMatch(self, m, data): abbr = etree.Element('abbr') abbr.text = AtomicString(m.group('abbr')) abbr.set('title', self.title) - return abbr + return abbr, m.start(0), m.end(0) def makeExtension(**kwargs): # pragma: no cover diff --git a/markdown/extensions/footnotes.py b/markdown/extensions/footnotes.py index d16cf84..a957278 100644 --- a/markdown/extensions/footnotes.py +++ b/markdown/extensions/footnotes.py @@ -17,7 +17,7 @@ from __future__ import absolute_import from __future__ import unicode_literals from . import Extension from ..preprocessors import Preprocessor -from ..inlinepatterns import Pattern +from ..inlinepatterns import InlineProcessor from ..treeprocessors import Treeprocessor from ..postprocessors import Postprocessor from .. import util @@ -77,7 +77,7 @@ class FootnoteExtension(Extension): # Insert an inline pattern before ImageReferencePattern FOOTNOTE_RE = r'\[\^([^\]]*)\]' # blah blah [^1] blah md.inlinePatterns.add( - "footnote", FootnotePattern(FOOTNOTE_RE, self), "emphasis2' ) diff --git a/markdown/extensions/smarty.py b/markdown/extensions/smarty.py index 35c78a8..189651f 100644 --- a/markdown/extensions/smarty.py +++ b/markdown/extensions/smarty.py @@ -83,7 +83,7 @@ smartypants.py license: from __future__ import unicode_literals from . import Extension -from ..inlinepatterns import HtmlPattern, HTML_RE +from ..inlinepatterns import HtmlInlineProcessor, HTML_RE from ..odict import OrderedDict from ..treeprocessors import InlineProcessor @@ -150,21 +150,21 @@ remainingDoubleQuotesRegex = r'"' HTML_STRICT_RE = HTML_RE + r'(?!\>)' -class SubstituteTextPattern(HtmlPattern): +class SubstituteTextPattern(HtmlInlineProcessor): def __init__(self, pattern, replace, markdown_instance): """ Replaces matches with some text. 
""" - HtmlPattern.__init__(self, pattern) + HtmlInlineProcessor.__init__(self, pattern) self.replace = replace self.markdown = markdown_instance - def handleMatch(self, m): + def handleMatch(self, m, data): result = '' for part in self.replace: if isinstance(part, int): result += m.group(part) else: result += self.markdown.htmlStash.store(part) - return result + return result, m.start(0), m.end(0) class SmartyExtension(Extension): @@ -233,11 +233,11 @@ class SmartyExtension(Extension): (doubleQuoteSetsRe, (ldquo + lsquo,)), (singleQuoteSetsRe, (lsquo + ldquo,)), (decadeAbbrRe, (rsquo,)), - (openingSingleQuotesRegex, (2, lsquo)), + (openingSingleQuotesRegex, (1, lsquo)), (closingSingleQuotesRegex, (rsquo,)), - (closingSingleQuotesRegex2, (rsquo, 2)), + (closingSingleQuotesRegex2, (rsquo, 1)), (remainingSingleQuotesRegex, (lsquo,)), - (openingDoubleQuotesRegex, (2, ldquo)), + (openingDoubleQuotesRegex, (1, ldquo)), (closingDoubleQuotesRegex, (rdquo,)), (closingDoubleQuotesRegex2, (rdquo,)), (remainingDoubleQuotesRegex, (ldquo,)) @@ -255,7 +255,7 @@ class SmartyExtension(Extension): self.educateAngledQuotes(md) # Override HTML_RE from inlinepatterns.py so that it does not # process tags with duplicate closing quotes. - md.inlinePatterns["html"] = HtmlPattern(HTML_STRICT_RE, md) + md.inlinePatterns["html"] = HtmlInlineProcessor(HTML_STRICT_RE, md) if configs['smart_dashes']: self.educateDashes(md) inlineProcessor = InlineProcessor(md) diff --git a/markdown/extensions/wikilinks.py b/markdown/extensions/wikilinks.py index a4a3515..b535d9c 100644 --- a/markdown/extensions/wikilinks.py +++ b/markdown/extensions/wikilinks.py @@ -18,7 +18,7 @@ License: [BSD](http://www.opensource.org/licenses/bsd-license.php) from __future__ import absolute_import from __future__ import unicode_literals from . 
import Extension -from ..inlinepatterns import Pattern +from ..inlinepatterns import InlineProcessor from ..util import etree import re @@ -46,20 +46,20 @@ class WikiLinkExtension(Extension): # append to end of inline patterns WIKILINK_RE = r'\[\[([\w0-9_ -]+)\]\]' - wikilinkPattern = WikiLinks(WIKILINK_RE, self.getConfigs()) + wikilinkPattern = WikiLinksInlineProcessor(WIKILINK_RE, self.getConfigs()) wikilinkPattern.md = md md.inlinePatterns.add('wikilink', wikilinkPattern, ") or [text](url "title") -LINK_RE = NOIMG + BRK + \ - r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)''' +LINK_RE = NOIMG + r'\[' # ![alttxt](http://x.com/) or ![alttxt]() -IMAGE_LINK_RE = r'\!' + BRK + r'\s*\(\s*(<.*?>|([^"\)\s]+\s*"[^"]*"|[^\)\s]*))\s*\)' +IMAGE_LINK_RE = r'\!\[' # [Google][3] -REFERENCE_RE = NOIMG + BRK + r'\s?\[([^\]]*)\]' - -# [Google] -SHORT_REF_RE = NOIMG + r'\[([^\]]+)\]' +REFERENCE_RE = LINK_RE # ![alt text][2] -IMAGE_REFERENCE_RE = r'\!' + BRK + r'\s?\[([^\]]*)\]' +IMAGE_REFERENCE_RE = IMAGE_LINK_RE # stand-alone * or _ NOT_STRONG_RE = r'((^| )(\*|_)( |$))' @@ -172,6 +161,7 @@ def handleAttributes(text, parent): """Set values of an element based on attribute definitions ({@id=123}).""" def attributeCallback(match): parent.set(match.group(1), match.group(2).replace('\n', ' ')) + return '' return ATTR_RE.sub(attributeCallback, text) @@ -181,7 +171,7 @@ The pattern classes """ -class Pattern(object): +class Pattern(object): # pragma: no cover """Base class that inline patterns subclass. """ ANCESTOR_EXCLUDES = tuple() @@ -241,24 +231,79 @@ class Pattern(object): return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text) -class SimpleTextPattern(Pattern): +class InlineProcessor(Pattern): + """ + Base class that inline patterns subclass. + + This is the newer style inline processor that uses a more + efficient and flexible search approach. + """ + + def __init__(self, pattern, markdown_instance=None): + """ + Create an instant of an inline pattern. 
+ + Keyword arguments: + + * pattern: A regular expression that matches a pattern + + """ + self.pattern = pattern + self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE) + + # Api for Markdown to pass safe_mode into instance + self.safe_mode = False + if markdown_instance: + self.markdown = markdown_instance + + def handleMatch(self, m, data): + """Return a ElementTree element from the given match and the + start and end index of the matched text. + + If `start` and/or `end` are returned as `None`, it will be + assumed that the processor did not find a valid region of text. + + Subclasses should override this method. + + Keyword arguments: + + * m: A re match object containing a match of the pattern. + * data: The buffer current under analysis + + Returns: + + * el: The ElementTree element, text or None. + * start: The start of the region that has been matched or None. + * end: The end of the region that has been matched or None. + + """ + pass # pragma: no cover + + +class SimpleTextPattern(Pattern): # pragma: no cover """ Return a simple text of group(2) of a Pattern. """ def handleMatch(self, m): return m.group(2) -class EscapePattern(Pattern): +class SimpleTextInlineProcessor(InlineProcessor): + """ Return a simple text of group(1) of a Pattern. """ + def handleMatch(self, m, data): + return m.group(1), m.start(0), m.end(0) + + +class EscapeInlineProcessor(InlineProcessor): """ Return an escaped character. """ - def handleMatch(self, m): - char = m.group(2) + def handleMatch(self, m, data): + char = m.group(1) if char in self.markdown.ESCAPED_CHARS: - return '%s%s%s' % (util.STX, ord(char), util.ETX) + return '%s%s%s' % (util.STX, ord(char), util.ETX), m.start(0), m.end(0) else: - return None + return None, m.start(0), m.end(0) -class SimpleTagPattern(Pattern): +class SimpleTagPattern(Pattern): # pragma: no cover """ Return element of type `tag` with a text attribute of group(3) of a Pattern. 
@@ -274,29 +319,51 @@ class SimpleTagPattern(Pattern): return el -class SubstituteTagPattern(SimpleTagPattern): +class SimpleTagInlineProcessor(InlineProcessor): + """ + Return element of type `tag` with a text attribute of group(2) + of a Pattern. + + """ + def __init__(self, pattern, tag): + InlineProcessor.__init__(self, pattern) + self.tag = tag + + def handleMatch(self, m, data): + el = util.etree.Element(self.tag) + el.text = m.group(2) + return el, m.start(0), m.end(0) + + +class SubstituteTagPattern(SimpleTagPattern): # pragma: no cover """ Return an element of type `tag` with no children. """ def handleMatch(self, m): return util.etree.Element(self.tag) -class BacktickPattern(Pattern): +class SubstituteTagInlineProcessor(SimpleTagInlineProcessor): + """ Return an element of type `tag` with no children. """ + def handleMatch(self, m, data): + return util.etree.Element(self.tag), m.start(0), m.end(0) + + +class BacktickInlineProcessor(InlineProcessor): """ Return a `` element containing the matching text. """ def __init__(self, pattern): - Pattern.__init__(self, pattern) + InlineProcessor.__init__(self, pattern) self.ESCAPED_BSLASH = '%s%s%s' % (util.STX, ord('\\'), util.ETX) self.tag = 'code' - def handleMatch(self, m): - if m.group(4): + def handleMatch(self, m, data): + if m.group(3): el = util.etree.Element(self.tag) - el.text = util.AtomicString(m.group(4).strip()) - return el + el.text = util.AtomicString(m.group(3).strip()) + return el, m.start(0), m.end(0) else: - return m.group(2).replace('\\\\', self.ESCAPED_BSLASH) + return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0) -class DoubleTagPattern(SimpleTagPattern): +class DoubleTagPattern(SimpleTagPattern): # pragma: no cover """Return a ElementTree element nested in tag2 nested in tag1. Useful for strong emphasis etc. 
@@ -312,12 +379,28 @@ class DoubleTagPattern(SimpleTagPattern): return el1 -class HtmlPattern(Pattern): +class DoubleTagInlineProcessor(SimpleTagInlineProcessor): + """Return a ElementTree element nested in tag2 nested in tag1. + + Useful for strong emphasis etc. + + """ + def handleMatch(self, m, data): + tag1, tag2 = self.tag.split(",") + el1 = util.etree.Element(tag1) + el2 = util.etree.SubElement(el1, tag2) + el2.text = m.group(2) + if len(m.groups()) == 3: + el2.tail = m.group(3) + return el1, m.start(0), m.end(0) + + +class HtmlInlineProcessor(InlineProcessor): """ Store raw inline html and return a placeholder. """ - def handleMatch(self, m): - rawhtml = self.unescape(m.group(2)) + def handleMatch(self, m, data): + rawhtml = self.unescape(m.group(1)) place_holder = self.markdown.htmlStash.store(rawhtml) - return place_holder + return place_holder, m.start(0), m.end(0) def unescape(self, text): """ Return unescaped text given text with an inline placeholder. """ @@ -338,74 +421,234 @@ class HtmlPattern(Pattern): return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text) -class LinkPattern(Pattern): +class LinkInlineProcessor(InlineProcessor): """ Return a link element from the given match. 
""" - def handleMatch(self, m): + RE_LINK = re.compile(r'''\(\s*(?:(<.*?>)\s*(?:(['"])(.*?)\2\s*)?\))?''', re.DOTALL | re.UNICODE) + RE_TITLE_CLEAN = re.compile(r'\s') + + def handleMatch(self, m, data): + text, index, handled = self.getText(data, m.end(0)) + + if not handled: + return None, None, None + + href, title, index, handled = self.getLink(data, index) + if not handled: + return None, None, None + el = util.etree.Element("a") - el.text = m.group(2) - title = m.group(13) - href = m.group(9) + el.text = text - if href: - if href[0] == "<": - href = href[1:-1] - el.set("href", self.unescape(href.strip())) - else: - el.set("href", "") + el.set("href", href) - if title: - title = dequote(self.unescape(title)) + if title is not None: el.set("title", title) - return el + return el, m.start(0), index + + def getLink(self, data, index): + """Parse data between `()` of `[Text]()` allowing recursive `()`. """ + + href = '' + title = None + handled = False + + m = self.RE_LINK.match(data, pos=index) + if m and m.group(1): + # Matches [Text]( "title") + href = m.group(1)[1:-1].strip() + if m.group(3): + title = m.group(3) + index = m.end(0) + handled = True + elif m: + # Track bracket nesting and index in string + bracket_count = 1 + backtrack_count = 1 + start_index = m.end() + index = start_index + last_bracket = -1 + + # Primary (first found) quote tracking. + quote = None + start_quote = -1 + exit_quote = -1 + ignore_matches = False + + # Secondary (second found) quote tracking. + alt_quote = None + start_alt_quote = -1 + exit_alt_quote = -1 + + # Track last character + last = '' + + for pos in util.iterrange(index, len(data)): + c = data[pos] + if c == '(': + # Count nested ( + # Don't increment the bracket count if we are sure we're in a title. + if not ignore_matches: + bracket_count += 1 + elif backtrack_count > 0: + backtrack_count -= 1 + elif c == ')': + # Match nested ) to ( + # Don't decrement if we are sure we are in a title that is unclosed. 
+ if ((exit_quote != -1 and quote == last) or (exit_alt_quote != -1 and alt_quote == last)): + bracket_count = 0 + elif not ignore_matches: + bracket_count -= 1 + elif backtrack_count > 0: + backtrack_count -= 1 + # We've found our backup end location if the title doesn't reslove. + if backtrack_count == 0: + last_bracket = index + 1 + + elif c in ("'", '"'): + # Quote has started + if not quote: + # We'll assume we are now in a title. + # Brackets are quoted, so no need to match them (except for the final one). + ignore_matches = True + backtrack_count = bracket_count + bracket_count = 1 + start_quote = index + 1 + quote = c + # Secondary quote (in case the first doesn't resolve): [text](link'"title") + elif c != quote and not alt_quote: + start_alt_quote = index + 1 + alt_quote = c + # Update primary quote match + elif c == quote: + exit_quote = index + 1 + # Update secondary quote match + elif alt_quote and c == alt_quote: + exit_alt_quote = index + 1 + + index += 1 + + # Link is closed, so let's break out of the loop + if bracket_count == 0: + # Get the title if we closed a title string right before link closed + if exit_quote >= 0 and quote == last: + href = data[start_index:start_quote - 1] + title = ''.join(data[start_quote:exit_quote - 1]) + elif exit_alt_quote >= 0 and alt_quote == last: + href = data[start_index:start_alt_quote - 1] + title = ''.join(data[start_alt_quote:exit_alt_quote - 1]) + else: + href = data[start_index:index - 1] + break + + if c != ' ': + last = c + + # We have a scenario: [test](link"notitle) + # When we enter a string, we stop tracking bracket resolution in the main counter, + # but we do keep a backup counter up until we discover where we might resolve all brackets + # if the title string fails to resolve. 
+ if bracket_count != 0 and backtrack_count == 0: + href = data[start_index:last_bracket - 1] + index = last_bracket + bracket_count = 0 + + handled = bracket_count == 0 + + if title is not None: + title = self.RE_TITLE_CLEAN.sub(' ', dequote(self.unescape(title.strip()))) + + href = self.unescape(href).strip() + + return href, title, index, handled + + def getText(self, data, index): + """Parse the content between `[]` of the start of an image or link + resolving nested square brackets. -class ImagePattern(LinkPattern): + """ + bracket_count = 1 + text = [] + for pos in util.iterrange(index, len(data)): + c = data[pos] + if c == ']': + bracket_count -= 1 + elif c == '[': + bracket_count += 1 + index += 1 + if bracket_count == 0: + break + text.append(c) + return ''.join(text), index, bracket_count == 0 + + +class ImageInlineProcessor(LinkInlineProcessor): """ Return a img element from the given match. """ - def handleMatch(self, m): + + def handleMatch(self, m, data): + text, index, handled = self.getText(data, m.end(0)) + if not handled: + return None, None, None + + src, title, index, handled = self.getLink(data, index) + if not handled: + return None, None, None + el = util.etree.Element("img") - src_parts = m.group(9).split() - if src_parts: - src = src_parts[0] - if src[0] == "<" and src[-1] == ">": - src = src[1:-1] - el.set('src', self.unescape(src)) - else: - el.set('src', "") - if len(src_parts) > 1: - el.set('title', dequote(self.unescape(" ".join(src_parts[1:])))) + + el.set("src", src) + + if title is not None: + el.set("title", title) if self.markdown.enable_attributes: - truealt = handleAttributes(m.group(2), el) + truealt = handleAttributes(text, el) else: - truealt = m.group(2) + truealt = text el.set('alt', self.unescape(truealt)) - return el + return el, m.start(0), index -class ReferencePattern(LinkPattern): +class ReferenceInlineProcessor(LinkInlineProcessor): """ Match to a stored reference and return link element. 
""" - NEWLINE_CLEANUP_RE = re.compile(r'[ ]?\n', re.MULTILINE) - def handleMatch(self, m): - try: - id = m.group(9).lower() - except IndexError: - id = None - if not id: - # if we got something like "[Google][]" or "[Google]" - # we'll use "google" as the id - id = m.group(2).lower() + RE_LINK = re.compile(r'\s?\[([^\]]*)\]', re.DOTALL | re.UNICODE) + + def handleMatch(self, m, data): + text, index, handled = self.getText(data, m.end(0)) + if not handled: + return None, None, None + + id, end, handled = self.evalId(data, index, text) + if not handled: + return None, None, None # Clean up linebreaks in id id = self.NEWLINE_CLEANUP_RE.sub(' ', id) if id not in self.markdown.references: # ignore undefined refs - return None + return None, m.start(0), end + href, title = self.markdown.references[id] - text = m.group(2) - return self.makeTag(href, title, text) + return self.makeTag(href, title, text), m.start(0), end + + def evalId(self, data, index, text): + """ + Evaluate the id portion of [ref][id]. + + If [ref][] use [ref]. + """ + m = self.RE_LINK.match(data, pos=index) + if not m: + return None, index, False + else: + id = m.group(1).lower() + end = m.end(0) + if not id: + id = text.lower() + return id, end, True def makeTag(self, href, title, text): el = util.etree.Element('a') @@ -418,7 +661,15 @@ class ReferencePattern(LinkPattern): return el -class ImageReferencePattern(ReferencePattern): +class ShortReferenceInlineProcessor(ReferenceInlineProcessor): + """Shorte form of reference: [google]. """ + def evalId(self, data, index, text): + """Evaluate the id from of [ref] """ + + return text.lower(), index, True + + +class ImageReferenceInlineProcessor(ReferenceInlineProcessor): """ Match to a stored reference and return img element. 
""" def makeTag(self, href, title, text): el = util.etree.Element("img") @@ -433,22 +684,22 @@ class ImageReferencePattern(ReferencePattern): return el -class AutolinkPattern(Pattern): +class AutolinkInlineProcessor(InlineProcessor): """ Return a link Element given an autolink (``). """ - def handleMatch(self, m): + def handleMatch(self, m, data): el = util.etree.Element("a") - el.set('href', self.unescape(m.group(2))) - el.text = util.AtomicString(m.group(2)) - return el + el.set('href', self.unescape(m.group(1))) + el.text = util.AtomicString(m.group(1)) + return el, m.start(0), m.end(0) -class AutomailPattern(Pattern): +class AutomailInlineProcessor(InlineProcessor): """ Return a mailto link Element given an automail link (``). """ - def handleMatch(self, m): + def handleMatch(self, m, data): el = util.etree.Element('a') - email = self.unescape(m.group(2)) + email = self.unescape(m.group(1)) if email.startswith("mailto:"): email = email[len("mailto:"):] @@ -467,4 +718,4 @@ class AutomailPattern(Pattern): mailto = "".join([util.AMP_SUBSTITUTE + '#%d;' % ord(letter) for letter in mailto]) el.set('href', mailto) - return el + return el, m.start(0), m.end(0) diff --git a/markdown/treeprocessors.py b/markdown/treeprocessors.py index e2566a4..7c37ae7 100644 --- a/markdown/treeprocessors.py +++ b/markdown/treeprocessors.py @@ -231,21 +231,38 @@ class InlineProcessor(Treeprocessor): Returns: String with placeholders instead of ElementTree elements. """ + new_style = isinstance(pattern, inlinepatterns.InlineProcessor) for exclude in pattern.ANCESTOR_EXCLUDES: if exclude.lower() in self.ancestors: return data, False, 0 - match = pattern.getCompiledRegExp().match(data[startIndex:]) - leftData = data[:startIndex] + if new_style: + match = None + # Since handleMatch may reject our first match, + # we iterate over the buffer looking for matches + # until we can't find any more. 
+ for match in pattern.getCompiledRegExp().finditer(data, startIndex): + node, start, end = pattern.handleMatch(match, data) + if start is None or end is None: + startIndex += match.end(0) + match = None + continue + break + else: # pragma: no cover + match = pattern.getCompiledRegExp().match(data[startIndex:]) + leftData = data[:startIndex] if not match: return data, False, 0 - node = pattern.handleMatch(match) + if not new_style: # pragma: no cover + node = pattern.handleMatch(match) + start = match.start(0) + end = match.end(0) if node is None: - return data, True, len(leftData)+match.span(len(match.groups()))[0] + return data, True, end if not isString(node): if not isinstance(node.text, util.AtomicString): @@ -265,9 +282,13 @@ class InlineProcessor(Treeprocessor): placeholder = self.__stashNode(node, pattern.type()) - return "%s%s%s%s" % (leftData, - match.group(1), - placeholder, match.groups()[-1]), True, 0 + if new_style: + return "%s%s%s" % (data[:start], + placeholder, data[end:]), True, 0 + else: # pragma: no cover + return "%s%s%s%s" % (leftData, + match.group(1), + placeholder, match.groups()[-1]), True, 0 def __build_ancestors(self, parent, parents): """Build the ancestor list.""" diff --git a/markdown/util.py b/markdown/util.py index 8897195..3a36c00 100644 --- a/markdown/util.py +++ b/markdown/util.py @@ -14,10 +14,12 @@ if PY3: # pragma: no cover string_type = str text_type = str int2str = chr + iterrange = range else: # pragma: no cover string_type = basestring # noqa text_type = unicode # noqa int2str = unichr # noqa + iterrange = xrange # noqa """ diff --git a/tests/misc/image.html b/tests/misc/image.html deleted file mode 100644 index 1171e4e..0000000 --- a/tests/misc/image.html +++ /dev/null @@ -1,5 +0,0 @@ -

Poster

-

Poster

-

Blank

-

![Fail](http://humane man.jpg "The most humane man.")

-

![Fail](http://humane man.jpg)

\ No newline at end of file diff --git a/tests/misc/image.txt b/tests/misc/image.txt deleted file mode 100644 index 3fae16a..0000000 --- a/tests/misc/image.txt +++ /dev/null @@ -1,12 +0,0 @@ - -![Poster](http://humane_man.jpg "The most humane man.") - -![Poster][] - -[Poster]:http://humane_man.jpg "The most humane man." - -![Blank]() - -![Fail](http://humane man.jpg "The most humane man.") - -![Fail](http://humane man.jpg) diff --git a/tests/test_apis.py b/tests/test_apis.py index aa43e52..15ecc5b 100644 --- a/tests/test_apis.py +++ b/tests/test_apis.py @@ -753,16 +753,16 @@ class TestEscapeAppend(unittest.TestCase): class TestAncestorExclusion(unittest.TestCase): """ Tests exclusion of tags in ancestor list. """ - class AncestorExample(markdown.inlinepatterns.SimpleTagPattern): + class AncestorExample(markdown.inlinepatterns.SimpleTagInlineProcessor): """ Ancestor Test. """ ANCESTOR_EXCLUDES = ('a',) - def handleMatch(self, m): + def handleMatch(self, m, data): """ Handle match. """ el = markdown.util.etree.Element(self.tag) - el.text = m.group(3) - return el + el.text = m.group(2) + return el, m.start(0), m.end(0) class AncestorExtension(markdown.Extension): @@ -774,7 +774,7 @@ class TestAncestorExclusion(unittest.TestCase): def extendMarkdown(self, md, md_globals): """Modify inline patterns.""" - pattern = r'(\+)([^\+]+)\2' + pattern = r'(\+)([^\+]+)\1' md.inlinePatterns["ancestor-test"] = TestAncestorExclusion.AncestorExample(pattern, 'strong') def setUp(self): diff --git a/tests/test_syntax/inline/__init__.py b/tests/test_syntax/inline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_syntax/inline/images.py b/tests/test_syntax/inline/images.py new file mode 100644 index 0000000..9c1dc34 --- /dev/null +++ b/tests/test_syntax/inline/images.py @@ -0,0 +1,139 @@ +from markdown.test_tools import TestCase + + +class TestAdvancedImages(TestCase): + + def test_nested_square_brackets(self): + self.assertMarkdownRenders( + 
"""![Text[[[[[[[]]]]]]][]](http://link.com/image.png) more text""", + """

Text[[[[[[[]]]]]]][] more text

""" + ) + + def test_nested_round_brackets(self): + self.assertMarkdownRenders( + """![Text](http://link.com/(((((((()))))))()).png) more text""", + """

Text more text

""" + ) + + def test_uneven_brackets_with_titles1(self): + self.assertMarkdownRenders( + """![Text](http://link.com/(.png"title") more text""", + """

Text more text

""" + ) + + def test_uneven_brackets_with_titles2(self): + self.assertMarkdownRenders( + """![Text](http://link.com/('.png"title") more text""", + """

<p><img alt="Text" src="http://link.com/('.png" title="title" /> more text</p>

""" + ) + + def test_uneven_brackets_with_titles3(self): + self.assertMarkdownRenders( + """![Text](http://link.com/(.png"title)") more text""", + """

<p><img alt="Text" src="http://link.com/(.png" title="title)" /> more text</p>

""" + ) + + def test_uneven_brackets_with_titles4(self): + self.assertMarkdownRenders( + """![Text](http://link.com/(.png "title") more text""", + """

<p><img alt="Text" src="http://link.com/(.png" title="title" /> more text</p>

""" + ) + + def test_uneven_brackets_with_titles5(self): + self.assertMarkdownRenders( + """![Text](http://link.com/(.png "title)") more text""", + """

<p><img alt="Text" src="http://link.com/(.png" title="title)" /> more text</p>

""" + ) + + def test_mixed_title_quotes1(self): + self.assertMarkdownRenders( + """![Text](http://link.com/'.png"title") more text""", + """

<p><img alt="Text" src="http://link.com/'.png" title="title" /> more text</p>

""" + ) + + def test_mixed_title_quotes2(self): + self.assertMarkdownRenders( + """![Text](http://link.com/".png'title') more text""", + """

<p><img alt="Text" src="http://link.com/&quot;.png" title="title" /> more text</p>

""" + ) + + def test_mixed_title_quotes3(self): + self.assertMarkdownRenders( + """![Text](http://link.com/with spaces.png'"and quotes" 'and title') more text""", + """

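<p><img alt="Text" src="http://link.com/with spaces.png" title="&quot;and quotes&quot; 'and title" />""" + """ more text</p>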

""" + ) + + def test_mixed_title_quotes4(self): + self.assertMarkdownRenders( + """![Text](http://link.com/with spaces'.png"and quotes" 'and title") more text""", + """

<p><img alt="Text" src="http://link.com/with spaces'.png" title="and quotes&quot; 'and title" />""" + """ more text</p>

""" + ) + + def test_mixed_title_quotes5(self): + self.assertMarkdownRenders( + """![Text](http://link.com/with spaces .png'"and quotes" 'and title') more text""", + """

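<p><img alt="Text" src="http://link.com/with spaces .png" title="&quot;and quotes&quot;""" + """ 'and title" /> more text</p>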

""" + ) + + def test_mixed_title_quotes6(self): + self.assertMarkdownRenders( + """![Text](http://link.com/with spaces "and quotes".png 'and title') more text""", + """

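<p><img alt="Text" src="http://link.com/with spaces &quot;and quotes&quot;.png" title="and title" />""" + """ more text</p>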

""" + ) + + def test_single_quote(self): + self.assertMarkdownRenders( + """![test](link"notitle.png)""", + """

<p><img alt="test" src="link&quot;notitle.png" /></p>
""" + ) + + def test_angle_with_mixed_title_quotes(self): + self.assertMarkdownRenders( + """![Text](<http://link.com/with spaces '"and quotes".png> 'and title') more text""", + """

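<p><img alt="Text" src="http://link.com/with spaces '&quot;and quotes&quot;.png" title="and title" />""" + """ more text</p>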

""" + ) + + def test_misc(self): + self.assertMarkdownRenders( + """![Poster](http://humane_man.jpg "The most humane man.")""", + """

<p><img alt="Poster" src="http://humane_man.jpg" title="The most humane man." /></p>

""" + ) + + def test_misc_ref(self): + self.assertMarkdownRenders( + self.dedent( + """ + ![Poster][] + + [Poster]:http://humane_man.jpg "The most humane man." + """ + ), + self.dedent( + """ +

<p><img alt="Poster" src="http://humane_man.jpg" title="The most humane man." /></p>

+ """ + ) + ) + + def test_misc_blank(self): + self.assertMarkdownRenders( + """![Blank]()""", + """

<p><img alt="Blank" src="" /></p>

""" + ) + + def test_misc_img_title(self): + self.assertMarkdownRenders( + """![Image](http://humane man.jpg "The most humane man.")""", + """

<p><img alt="Image" src="http://humane man.jpg" title="The most humane man." /></p>

""" + ) + + def test_misc_img(self): + self.assertMarkdownRenders( + """![Image](http://humane man.jpg)""", + """

<p><img alt="Image" src="http://humane man.jpg" /></p>

""" + ) diff --git a/tests/test_syntax/inline/links.py b/tests/test_syntax/inline/links.py new file mode 100644 index 0000000..fe58ada --- /dev/null +++ b/tests/test_syntax/inline/links.py @@ -0,0 +1,98 @@ +from markdown.test_tools import TestCase + + +class TestAdvancedLinks(TestCase): + + def test_nested_square_brackets(self): + self.assertMarkdownRenders( + """[Text[[[[[[[]]]]]]][]](http://link.com) more text""", + """

<p><a href="http://link.com">Text[[[[[[[]]]]]]][]</a> more text</p>

""" + ) + + def test_nested_round_brackets(self): + self.assertMarkdownRenders( + """[Text](http://link.com/(((((((()))))))())) more text""", + """

<p><a href="http://link.com/(((((((()))))))())">Text</a> more text</p>

""" + ) + + def test_uneven_brackets_with_titles1(self): + self.assertMarkdownRenders( + """[Text](http://link.com/("title") more text""", + """

<p><a href="http://link.com/(" title="title">Text</a> more text</p>

""" + ) + + def test_uneven_brackets_with_titles2(self): + self.assertMarkdownRenders( + """[Text](http://link.com/('"title") more text""", + """

<p><a href="http://link.com/('" title="title">Text</a> more text</p>

""" + ) + + def test_uneven_brackets_with_titles3(self): + self.assertMarkdownRenders( + """[Text](http://link.com/("title)") more text""", + """

<p><a href="http://link.com/(" title="title)">Text</a> more text</p>

""" + ) + + def test_uneven_brackets_with_titles4(self): + self.assertMarkdownRenders( + """[Text](http://link.com/( "title") more text""", + """

<p><a href="http://link.com/(" title="title">Text</a> more text</p>

""" + ) + + def test_uneven_brackets_with_titles5(self): + self.assertMarkdownRenders( + """[Text](http://link.com/( "title)") more text""", + """

<p><a href="http://link.com/(" title="title)">Text</a> more text</p>

""" + ) + + def test_mixed_title_quotes1(self): + self.assertMarkdownRenders( + """[Text](http://link.com/'"title") more text""", + """

<p><a href="http://link.com/'" title="title">Text</a> more text</p>

""" + ) + + def test_mixed_title_quotes2(self): + self.assertMarkdownRenders( + """[Text](http://link.com/"'title') more text""", + """

<p><a href="http://link.com/&quot;" title="title">Text</a> more text</p>

""" + ) + + def test_mixed_title_quotes3(self): + self.assertMarkdownRenders( + """[Text](http://link.com/with spaces'"and quotes" 'and title') more text""", + """
<p><a href="http://link.com/with spaces" title="&quot;and quotes&quot; 'and title">""" + """Text</a> more text</p>

""" + ) + + def test_mixed_title_quotes4(self): + self.assertMarkdownRenders( + """[Text](http://link.com/with spaces'"and quotes" 'and title") more text""", + """

<p><a href="http://link.com/with spaces'" title="and quotes&quot; 'and title">Text</a> more text</p>

""" + ) + + def test_mixed_title_quotes5(self): + self.assertMarkdownRenders( + """[Text](http://link.com/with spaces '"and quotes" 'and title') more text""", + """
<p><a href="http://link.com/with spaces" title="&quot;and quotes&quot; 'and title">""" + """Text</a> more text</p>

""" + ) + + def test_mixed_title_quotes6(self): + self.assertMarkdownRenders( + """[Text](http://link.com/with spaces "and quotes" 'and title') more text""", + """
<p><a href="http://link.com/with spaces &quot;and quotes&quot;" title="and title">""" + """Text</a> more text</p>

""" + ) + + def test_single_quote(self): + self.assertMarkdownRenders( + """[test](link"notitle)""", + """

<p><a href="link&quot;notitle">test</a></p>
""" + ) + + def test_angle_with_mixed_title_quotes(self): + self.assertMarkdownRenders( + """[Text](<http://link.com/with spaces '"and quotes"> 'and title') more text""", + """
<p><a href="http://link.com/with spaces '&quot;and quotes&quot;" title="and title">""" + """Text</a> more text</p>

""" + ) -- cgit v1.2.3