diff options
Diffstat (limited to 'markdown.py')
-rw-r--r-- | markdown.py | 26 |
1 files changed, 20 insertions, 6 deletions
diff --git a/markdown.py b/markdown.py index c57ef47..4d1eeaf 100644 --- a/markdown.py +++ b/markdown.py @@ -84,7 +84,12 @@ Importantly, NanoDom does not do normalization, which is what we want. It also adds extra white space when converting DOM to string """ -ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&(?!\#)"), "&amp;"), +ENTITY_NORMALIZATION_EXPRESSIONS = [ (re.compile("&"), "&amp;"), + (re.compile("<"), "&lt;"), + (re.compile(">"), "&gt;"), + (re.compile("\""), "&quot;")] + +ENTITY_NORMALIZATION_EXPRESSIONS_SOFT = [ (re.compile("&(?!\#)"), "&amp;"), (re.compile("<"), "&lt;"), (re.compile(">"), "&gt;"), (re.compile("\""), "&quot;")] @@ -122,9 +127,14 @@ class Document : def toxml (self) : return self.documentElement.toxml() - def normalizeEntities(self, text) : + def normalizeEntities(self, text, avoidDoubleNormalizing=False) : - for regexp, substitution in ENTITY_NORMALIZATION_EXPRESSIONS : + if avoidDoubleNormalizing : + regexps = ENTITY_NORMALIZATION_EXPRESSIONS_SOFT + else : + regexps = ENTITY_NORMALIZATION_EXPRESSIONS + + for regexp, substitution in regexps : text = regexp.sub(substitution, text) return text @@ -213,7 +223,7 @@ class Element : buffer += "<" + self.nodeName for attr in self.attributes : value = self.attribute_values[attr] - value = self.doc.normalizeEntities(value) + value = self.doc.normalizeEntities(value, avoidDoubleNormalizing=True) buffer += ' %s="%s"' % (attr, value) if self.childNodes or self.nodeName in ['blockquote']: buffer += ">" @@ -399,6 +409,7 @@ class HtmlBlockPreprocessor (Preprocessor): def run (self, lines) : + new_blocks = [] text = "\n".join(lines) text = text.split("\n\n") @@ -635,12 +646,12 @@ class LinkPattern (Pattern): parts = m.group(9).split('"') # We should now have [], [href], or [href, title] if parts : - el.setAttribute('href', parts[0]) + el.setAttribute('href', parts[0].strip()) else : el.setAttribute('href', "") if len(parts) > 1 : # we also got a title - title = " ".join(parts[1:]).strip() + title = '"' + '"'.join(parts[1:]).strip() 
title = dequote(title) #.replace('"', "&quot;") el.setAttribute('title', title) return el @@ -1388,6 +1399,9 @@ class Markdown: # (ideally this should be recursive. # here we only go one level deep) + if x.nodeName in ["code", "pre"] : + break + j = 0 while j < len(x.childNodes): child = x.childNodes[j] |