aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorIsaac Muse <faceless.shop@gmail.com>2018-07-29 12:44:18 -0600
committerWaylan Limberg <waylan.limberg@icloud.com>2018-07-29 14:44:18 -0400
commit59406c41e7c3548d1c95a2091e2d676323494f62 (patch)
tree48eabd91c037842674cb2af0454a90de97eb6f1b
parent0081cb8519ebda441b129462e8eb6c0f6c7d30a4 (diff)
downloadmarkdown-59406c41e7c3548d1c95a2091e2d676323494f62.tar.gz
markdown-59406c41e7c3548d1c95a2091e2d676323494f62.tar.bz2
markdown-59406c41e7c3548d1c95a2091e2d676323494f62.zip
Fix double escaping of amp in attributes (#670)
The serializer should only escape & in attributes when it is not already part of an entity such as &amp;. A better regex avoids matching Unicode characters and `_` in amp detection. In general, we don't want to escape already-escaped content, but for code content we want literal representations of escaped content, so code content now explicitly escapes its text before it is placed in an AtomicString. Closes #669.
-rw-r--r--markdown/blockprocessors.py4
-rw-r--r--markdown/inlinepatterns.py4
-rw-r--r--markdown/serializers.py11
-rw-r--r--markdown/util.py11
-rw-r--r--tests/misc/amp-in-url.html1
-rw-r--r--tests/misc/amp-in-url.txt1
-rw-r--r--tests/test_apis.py9
-rw-r--r--tests/test_syntax/inline/test_images.py (renamed from tests/test_syntax/inline/images.py)0
-rw-r--r--tests/test_syntax/inline/test_links.py (renamed from tests/test_syntax/inline/links.py)12
9 files changed, 44 insertions, 9 deletions
diff --git a/markdown/blockprocessors.py b/markdown/blockprocessors.py
index d2c9cd3..378c7c7 100644
--- a/markdown/blockprocessors.py
+++ b/markdown/blockprocessors.py
@@ -259,14 +259,14 @@ class CodeBlockProcessor(BlockProcessor):
code = sibling[0]
block, theRest = self.detab(block)
code.text = util.AtomicString(
- '%s\n%s\n' % (code.text, block.rstrip())
+ '%s\n%s\n' % (code.text, util.code_escape(block.rstrip()))
)
else:
# This is a new codeblock. Create the elements and insert text.
pre = util.etree.SubElement(parent, 'pre')
code = util.etree.SubElement(pre, 'code')
block, theRest = self.detab(block)
- code.text = util.AtomicString('%s\n' % block.rstrip())
+ code.text = util.AtomicString('%s\n' % util.code_escape(block.rstrip()))
if theRest:
# This block contained unindented line(s) after the first indented
# line. Insert these lines as the first block of the master blocks
diff --git a/markdown/inlinepatterns.py b/markdown/inlinepatterns.py
index 83edf4b..8d49d07 100644
--- a/markdown/inlinepatterns.py
+++ b/markdown/inlinepatterns.py
@@ -158,7 +158,7 @@ AUTOMAIL_RE = r'<([^> \!]*@[^> ]*)>'
HTML_RE = r'(\<([a-zA-Z/][^\>]*?|\!--.*?--)\>)'
# &amp;
-ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)'
+ENTITY_RE = r'(&(?:\#[0-9]+|[a-zA-Z0-9]+);)'
# two spaces at end of line
LINE_BREAK_RE = r' \n'
@@ -369,7 +369,7 @@ class BacktickInlineProcessor(InlineProcessor):
def handleMatch(self, m, data):
if m.group(3):
el = util.etree.Element(self.tag)
- el.text = util.AtomicString(m.group(3).strip())
+ el.text = util.AtomicString(util.code_escape(m.group(3).strip()))
return el, m.start(0), m.end(0)
else:
return m.group(1).replace('\\\\', self.ESCAPED_BSLASH), m.start(0), m.end(0)
diff --git a/markdown/serializers.py b/markdown/serializers.py
index 308cf7a..3cfa6bb 100644
--- a/markdown/serializers.py
+++ b/markdown/serializers.py
@@ -41,6 +41,7 @@ from __future__ import absolute_import
from __future__ import unicode_literals
from xml.etree.ElementTree import ProcessingInstruction
from . import util
+import re
ElementTree = util.etree.ElementTree
QName = util.etree.QName
if hasattr(util.etree, 'test_comment'): # pragma: no cover
@@ -52,6 +53,7 @@ __all__ = ['to_html_string', 'to_xhtml_string']
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
"img", "input", "isindex", "link", "meta", "param")
+RE_AMP = re.compile(r'&(?!(?:\#[0-9]+|[0-9a-z]+);)', re.I)
try:
HTML_EMPTY = set(HTML_EMPTY)
@@ -72,7 +74,8 @@ def _escape_cdata(text):
# shorter than 500 character, or so. assume that's, by far,
# the most common case in most applications.
if "&" in text:
- text = text.replace("&", "&amp;")
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
@@ -86,7 +89,8 @@ def _escape_attrib(text):
# escape attribute value
try:
if "&" in text:
- text = text.replace("&", "&amp;")
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
@@ -104,7 +108,8 @@ def _escape_attrib_html(text):
# escape attribute value
try:
if "&" in text:
- text = text.replace("&", "&amp;")
+ # Only replace & when not part of an entity
+ text = RE_AMP.sub('&amp;', text)
if "<" in text:
text = text.replace("<", "&lt;")
if ">" in text:
diff --git a/markdown/util.py b/markdown/util.py
index aeb7818..b40c010 100644
--- a/markdown/util.py
+++ b/markdown/util.py
@@ -140,6 +140,17 @@ def parseBoolValue(value, fail_on_errors=True, preserve_none=False):
raise ValueError('Cannot parse bool value: %r' % value)
+def code_escape(text):
+ """Escape code."""
+ if "&" in text:
+ text = text.replace("&", "&amp;")
+ if "<" in text:
+ text = text.replace("<", "&lt;")
+ if ">" in text:
+ text = text.replace(">", "&gt;")
+ return text
+
+
def deprecated(message):
"""
Raise a DeprecationWarning when wrapped function/method is called.
diff --git a/tests/misc/amp-in-url.html b/tests/misc/amp-in-url.html
deleted file mode 100644
index 2170a54..0000000
--- a/tests/misc/amp-in-url.html
+++ /dev/null
@@ -1 +0,0 @@
-<p><a href="http://www.freewisdom.org/this&amp;that">link</a></p> \ No newline at end of file
diff --git a/tests/misc/amp-in-url.txt b/tests/misc/amp-in-url.txt
deleted file mode 100644
index 471106e..0000000
--- a/tests/misc/amp-in-url.txt
+++ /dev/null
@@ -1 +0,0 @@
-[link](http://www.freewisdom.org/this&that)
diff --git a/tests/test_apis.py b/tests/test_apis.py
index d9d520d..c813e56 100644
--- a/tests/test_apis.py
+++ b/tests/test_apis.py
@@ -666,6 +666,15 @@ class testSerializers(unittest.TestCase):
'<div xmlns="&lt;&amp;&quot;test&#10;escaping&quot;&gt;"></div>'
)
+ def testQNamePreEscaping(self):
+ """ Test QName that is already partially escaped. """
+ qname = markdown.util.etree.QName('&lt;&amp;"test&#10;escaping"&gt;', 'div')
+ el = markdown.util.etree.Element(qname)
+ self.assertEqual(
+ markdown.serializers.to_xhtml_string(el),
+ '<div xmlns="&lt;&amp;&quot;test&#10;escaping&quot;&gt;"></div>'
+ )
+
def buildExtension(self):
""" Build an extension which registers fakeSerializer. """
def fakeSerializer(elem):
diff --git a/tests/test_syntax/inline/images.py b/tests/test_syntax/inline/test_images.py
index 52ce330..52ce330 100644
--- a/tests/test_syntax/inline/images.py
+++ b/tests/test_syntax/inline/test_images.py
diff --git a/tests/test_syntax/inline/links.py b/tests/test_syntax/inline/test_links.py
index d74bb75..3e8593f 100644
--- a/tests/test_syntax/inline/links.py
+++ b/tests/test_syntax/inline/test_links.py
@@ -118,3 +118,15 @@ class TestAdvancedLinks(TestCase):
"""<p><a href="http://link.com/with spaces '&quot;and quotes&quot;" title="and title">"""
"""Text</a> more text</p>"""
)
+
+ def test_amp_in_url(self):
+ """Test amp in URLs."""
+
+ self.assertMarkdownRenders(
+ '[link](http://www.freewisdom.org/this&that)',
+ '<p><a href="http://www.freewisdom.org/this&amp;that">link</a></p>'
+ )
+ self.assertMarkdownRenders(
+ '[title](http://example.com/?a=1&amp;b=2)',
+ '<p><a href="http://example.com/?a=1&amp;b=2">title</a></p>'
+ )