3 files changed, 96 insertions, 6 deletions
diff --git a/markdown.py b/markdown.py
index b25cadd..37c5478 100644
--- a/markdown.py
+++ b/markdown.py
@@ -34,6 +34,7 @@ __revision__ = "$Rev$"
 
 
 import re, sys, codecs
+from urlparse import urlparse
 
 from logging import getLogger, StreamHandler, Formatter, \
                     DEBUG, INFO, WARN, ERROR, CRITICAL
@@ -806,6 +807,9 @@ class Pattern:
         self.pattern = pattern
         self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern, re.DOTALL)
 
+        # API for Markdown to pass safe_mode into the instance
+        self.safe_mode = False
+
     def getCompiledRegExp (self):
         """ Return a compiled regular expression. """
         return self.compiled_re
@@ -889,6 +893,7 @@ class HtmlPattern (Pattern):
         return doc.createTextNode(place_holder)
 
 
+
 class LinkPattern (Pattern):
     """ Return a NanoDom link Element from the given match. """
     def handleMatch(self, m, doc):
@@ -897,7 +902,7 @@ class LinkPattern (Pattern):
         parts = m.group(9).split('"')
         # We should now have [], [href], or [href, title]
         if parts:
-            el.setAttribute('href', parts[0].strip())
+            el.setAttribute('href', self.sanatize_url(parts[0].strip()))
         else:
             el.setAttribute('href', "")
         if len(parts) > 1:
@@ -907,14 +912,45 @@ class LinkPattern (Pattern):
             el.setAttribute('title', title)
         return el
 
+    def sanatize_url(self, url):
+        """
+        Sanitize a url against xss attacks in "safe_mode".
+
+        Outside "safe_mode" the url is returned untouched. In "safe_mode"
+        known safe url formats are whitelisted, rather than blacklisting
+        `javascript:alert("XSS")` and all its aliases, and '' is returned
+        for a url that cannot be parsed or that fails any of these checks:
+
+        * The scheme must be whitelisted; a network location alone proves
+          nothing, as `javascript://example.com/%0Aalert(1)` has one.
+        * Only relative links, mailto and news may lack a network location.
+        * "path", "parameters", "query" and "fragment" must not contain a
+          literal colon, as some script aliases appear to `urlparse()` to
+          have no scheme. "netloc" may: `username:password@host:port`.
+        """
+        if not self.safe_mode:
+            return url
+        locless_schemes = ['', 'mailto', 'news']
+        allowed_schemes = locless_schemes + ['http', 'https', 'ftp', 'ftps']
+        try:
+            parsed = urlparse(url)
+        except ValueError:  # e.g. the malformed IPv6 literal "http://["
+            return ''
+        if parsed.scheme not in allowed_schemes:
+            return ''
+        if parsed.netloc == '' and parsed.scheme not in locless_schemes:
+            return ''
+        if any(":" in part for part in parsed[2:]):
+            return ''
+        return parsed.geturl()
 
-class ImagePattern (Pattern):
+class ImagePattern(LinkPattern):
     """ Return a NanoDom img Element from the given match. """
     def handleMatch(self, m, doc):
         el = doc.createElement('img')
         src_parts = m.group(9).split()
         if src_parts:
-            el.setAttribute('src', src_parts[0])
+            el.setAttribute('src', self.sanatize_url(src_parts[0]))
         else:
             el.setAttribute('src', "")
         if len(src_parts) > 1:
@@ -930,7 +966,7 @@ class ImagePattern (Pattern):
         el.setAttribute('alt', truealt)
         return el
 
-class ReferencePattern (Pattern):
+class ReferencePattern(LinkPattern):
     """ Match to a stored reference and return a NanoDom link Element. """
     def handleMatch(self, m, doc):
 
@@ -949,7 +985,7 @@ class ReferencePattern (Pattern):
 
     def makeTag(self, href, title, text, doc):
         el = doc.createElement('a')
-        el.setAttribute('href', href)
+        el.setAttribute('href', self.sanatize_url(href))
         if title:
             el.setAttribute('title', title)
         el.appendChild(doc.createTextNode(text))
@@ -960,7 +996,7 @@ class ImageReferencePattern (ReferencePattern):
     """ Match to a stored reference and return a NanoDom img Element. """
     def makeTag(self, href, title, text, doc):
         el = doc.createElement('img')
-        el.setAttribute('src', href)
+        el.setAttribute('src', self.sanatize_url(href))
         if title:
             el.setAttribute('title', title)
         el.setAttribute('alt', text)
@@ -1416,6 +1452,9 @@ class Markdown:
         for extension in self.registeredExtensions:
             extension.reset()
 
+        for pattern in self.inlinePatterns:
+            pattern.safe_mode = self.safeMode
+
 
     def _transform(self):
         """Transform the Markdown text into a XHTML body document.
diff --git a/tests/safe_mode/unsafe_urls.html b/tests/safe_mode/unsafe_urls.html
new file mode 100644
index 0000000..8eda30d
--- /dev/null
+++ b/tests/safe_mode/unsafe_urls.html
@@ -0,0 +1,24 @@
+<p>These links should be unsafe and not allowed in safe_mode
+</p>
+<p><a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+   <a href="">link</a>
+</p>
+<p><img src="" alt="img"/><a href="">ref</a>
+   <img src="" alt="imgref"/>
+</p>
+<p>These should work regardless:
+</p>
+<p><a href="relative/url.html">relative</a>
+   <a href="mailto:foo@bar.com">email</a>
+   <a href="news:some.news.group.com">news scheme</a>
+   <a href="http://example.com">http link</a>
+</p>
+\ No newline at end of file
diff --git a/tests/safe_mode/unsafe_urls.txt b/tests/safe_mode/unsafe_urls.txt
new file mode 100644
index 0000000..7bfd81d
--- /dev/null
+++ b/tests/safe_mode/unsafe_urls.txt
@@ -0,0 +1,27 @@
+These links should be unsafe and not allowed in safe_mode
+
+[link](javascript:alert%28'Hello%20world!'%29)
+[link](vbscript:msgbox%28%22Hello%20world!%22%29)
+[link](livescript:alert%28'Hello%20world!'%29)
+[link](mocha:[code])
+[link](jAvAsCrIpT:alert%28'Hello%20world!'%29)
+[link](ja&#32;vas&#32;cr&#32;ipt:alert%28'Hello%20world!'%29)
+[link](ja&#00032;vas&#32;cr&#32;ipt:alert%28'Hello%20world!'%29)
+[link](ja&#x00020;vas&#32;cr&#32;ipt:alert%28'Hello%20world!'%29)
+[link](ja%09&#x20;%0Avas&#32;cr&#x0a;ipt:alert%28'Hello%20world!'%29)
+[link](ja%20vas%20cr%20ipt:alert%28'Hello%20world!'%29)
+[link](live%20script:alert%28'Hello%20world!'%29)
+
+![img](javascript:alert%29'XSS'%29)
+[ref][]
+![imgref][]
+
+[ref]: javascript:alert%29'XSS'%29
+[imgref]: javascript:alert%29'XSS'%29
+
+These should work regardless:
+
+[relative](relative/url.html)
+[email](mailto:foo@bar.com)
+[news scheme](news:some.news.group.com)
+[http link](http://example.com)