diff options
-rw-r--r-- | markdown.py | 51 | ||||
-rw-r--r-- | tests/safe_mode/unsafe_urls.html | 24 | ||||
-rw-r--r-- | tests/safe_mode/unsafe_urls.txt | 27 |
3 files changed, 96 insertions, 6 deletions
diff --git a/markdown.py b/markdown.py index b25cadd..37c5478 100644 --- a/markdown.py +++ b/markdown.py @@ -34,6 +34,7 @@ __revision__ = "$Rev$" import re, sys, codecs +from urlparse import urlparse from logging import getLogger, StreamHandler, Formatter, \ DEBUG, INFO, WARN, ERROR, CRITICAL @@ -806,6 +807,9 @@ class Pattern: self.pattern = pattern self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL) + # Api for Markdown to pass safe_mode into instance + self.safe_mode = False + def getCompiledRegExp (self): """ Return a compiled regular expression. """ return self.compiled_re @@ -889,6 +893,7 @@ class HtmlPattern (Pattern): return doc.createTextNode(place_holder) + class LinkPattern (Pattern): """ Return a NanoDom link Element from the given match. """ def handleMatch(self, m, doc): @@ -897,7 +902,7 @@ class LinkPattern (Pattern): parts = m.group(9).split('"') # We should now have [], [href], or [href, title] if parts: - el.setAttribute('href', parts[0].strip()) + el.setAttribute('href', self.sanatize_url(parts[0].strip())) else: el.setAttribute('href', "") if len(parts) > 1: @@ -907,14 +912,45 @@ class LinkPattern (Pattern): el.setAttribute('title', title) return el + def sanatize_url(self, url): + """ + Sanitize a url against xss attacks in "safe_mode". + + Rather than specifically blacklisting `javascript:alert("XSS")` and all + its aliases (see <http://ha.ckers.org/xss.html>), we whitelist known + safe url formats. Most urls contain a network location, however some + are known not to (i.e.: mailto links). Script urls do not contain a + location. Additionally, for `javascript:...`, the scheme would be + "javascript" but some aliases will appear to `urlparse()` to have no + scheme. On top of that relative links (i.e.: "foo/bar.html") have no + scheme. Therefore we must check "path", "parameters", "query" and + "fragment" for any literal colons. We don't check "scheme" for colons + because it *should* never have any and "netloc" must allow the form: + `username:password@host:port`. + + """ + locless_schemes = ['', 'mailto', 'news'] + url = urlparse(url) + safe_url = False + if url.netloc != '' or url.scheme in locless_schemes: + safe_url = True + + for part in url[2:]: + if ":" in part: + safe_url = False + + if self.safe_mode and not safe_url: + return '' + else: + return url.geturl() -class ImagePattern (Pattern): +class ImagePattern(LinkPattern): """ Return a NanoDom img Element from the given match. """ def handleMatch(self, m, doc): el = doc.createElement('img') src_parts = m.group(9).split() if src_parts: - el.setAttribute('src', src_parts[0]) + el.setAttribute('src', self.sanatize_url(src_parts[0])) else: el.setAttribute('src', "") if len(src_parts) > 1: @@ -930,7 +966,7 @@ class ImagePattern (Pattern): el.setAttribute('alt', truealt) return el -class ReferencePattern (Pattern): +class ReferencePattern(LinkPattern): """ Match to a stored reference and return a NanoDom link Element. """ def handleMatch(self, m, doc): @@ -949,7 +985,7 @@ class ReferencePattern (Pattern): def makeTag(self, href, title, text, doc): el = doc.createElement('a') - el.setAttribute('href', href) + el.setAttribute('href', self.sanatize_url(href)) if title: el.setAttribute('title', title) el.appendChild(doc.createTextNode(text)) @@ -960,7 +996,7 @@ class ImageReferencePattern (ReferencePattern): """ Match to a stored reference and return a NanoDom img Element. """ def makeTag(self, href, title, text, doc): el = doc.createElement('img') - el.setAttribute('src', href) + el.setAttribute('src', self.sanatize_url(href)) if title: el.setAttribute('title', title) el.setAttribute('alt', text) @@ -1416,6 +1452,9 @@ class Markdown: for extension in self.registeredExtensions: extension.reset() + for pattern in self.inlinePatterns: + pattern.safe_mode = self.safeMode + def _transform(self): """Transform the Markdown text into a XHTML body document. diff --git a/tests/safe_mode/unsafe_urls.html b/tests/safe_mode/unsafe_urls.html new file mode 100644 index 0000000..8eda30d --- /dev/null +++ b/tests/safe_mode/unsafe_urls.html @@ -0,0 +1,24 @@ +<p>These links should be unsafe and not allowed in safe_mode +</p> +<p><a href="">link</a> + <a href="">link</a> + <a href="">link</a> + <a href="">link</a> + <a href="">link</a> + <a href="">link</a> + <a href="">link</a> + <a href="">link</a> + <a href="">link</a> + <a href="">link</a> + <a href="">link</a> +</p> +<p><img src="" alt="img"/><a href="">ref</a> + <img src="" alt="imgref"/> +</p> +<p>These should work regardless: +</p> +<p><a href="relative/url.html">relative</a> + <a href="mailto:foo@bar.com">email</a> + <a href="news:some.news.group.com">news scheme</a> + <a href="http://example.com">http link</a> +</p>
\ No newline at end of file diff --git a/tests/safe_mode/unsafe_urls.txt b/tests/safe_mode/unsafe_urls.txt new file mode 100644 index 0000000..7bfd81d --- /dev/null +++ b/tests/safe_mode/unsafe_urls.txt @@ -0,0 +1,27 @@ +These links should be unsafe and not allowed in safe_mode + +[link](javascript:alert%28'Hello%20world!'%29) +[link](vbscript:msgbox%28%22Hello%20world!%22%29) +[link](livescript:alert%28'Hello%20world!'%29) +[link](mocha:[code]) +[link](jAvAsCrIpT:alert%28'Hello%20world!'%29) +[link](ja vas cr ipt:alert%28'Hello%20world!'%29) +[link](ja vas cr ipt:alert%28'Hello%20world!'%29) +[link](ja vas cr ipt:alert%28'Hello%20world!'%29) +[link](ja%09 %0Avas cr
ipt:alert%28'Hello%20world!'%29) +[link](ja%20vas%20cr%20ipt:alert%28'Hello%20world!'%29) +[link](live%20script:alert%28'Hello%20world!'%29) + +![img](javascript:alert%29'XSS'%29) +[ref][] +![imgref][] + +[ref]: javascript:alert%29'XSS'%29 +[imgref]: javascript:alert%29'XSS'%29 + +These should work regardless: + +[relative](relative/url.html) +[email](mailto:foo@bar.com) +[news scheme](news:some.news.group.com) +[http link](http://example.com) |