%s\n
" % (HTML_PLACEHOLDER % i), html + "\n") text = text.replace(HTML_PLACEHOLDER % i, html) return text def escape(self, html): ''' Basic html escaping ''' html = html.replace('&', '&') html = html.replace('<', '<') html = html.replace('>', '>') return html.replace('"', '"') RAWHTMLTEXTPOSTPROCESSOR = RawHtmlTextPostprocessor() """ ====================================================================== ========================== MISC AUXILIARY CLASSES ==================== ====================================================================== """ class HtmlStash: """ This class is used for stashing HTML objects that we extract in the beginning and replace with place-holders. """ def __init__ (self): self.html_counter = 0 # for counting inline html segments self.rawHtmlBlocks=[] def store(self, html, safe=False): """ Saves an HTML segment for later reinsertion. Returns a placeholder string that needs to be inserted into the document. Keyword arguments: * html: an html segment * safe: label an html segment as safe for safemode Returns : a placeholder string """ self.rawHtmlBlocks.append((html, safe)) placeholder = HTML_PLACEHOLDER % self.html_counter self.html_counter += 1 return placeholder class BlockGuru: def _findHead(self, lines, fn, allowBlank=0): """ Functional magic to help determine boundaries of indented blocks. 
Keyword arguments: * lines: an array of strings * fn: a function that returns a substring of a string if the string matches the necessary criteria * allowBlank: specifies whether it's ok to have blank lines between matching functions Returns: a list of post processes items and the unused remainder of the original list """ items = [] item = -1 i = 0 # to keep track of where we are for line in lines: if not line.strip() and not allowBlank: return items, lines[i:] if not line.strip() and allowBlank: # If we see a blank line, this _might_ be the end i += 1 # Find the next non-blank line for j in range(i, len(lines)): if lines[j].strip(): next = lines[j] break else: # There is no more text => this is the end break # Check if the next non-blank line is still a part of the list part = fn(next) if part: items.append("") continue else: break # found end of the list part = fn(line) if part: items.append(part) i += 1 continue else: return items, lines[i:] else: i += 1 return items, lines[i:] def detabbed_fn(self, line): """ An auxiliary method to be passed to _findHead """ m = RE.regExp['tabbed'].match(line) if m: return m.group(4) else: return None def detectTabbed(self, lines): return self._findHead(lines, self.detabbed_fn, allowBlank = 1) def print_error(string): """Print an error string to stderr""" sys.stderr.write(string +'\n') def dequote(string): """ Removes quotes from around a string """ if ( ( string.startswith('"') and string.endswith('"')) or (string.startswith("'") and string.endswith("'")) ): return string[1:-1] else: return string """ ====================================================================== ========================== CORE MARKDOWN ============================= ====================================================================== This stuff is ugly, so if you are thinking of extending the syntax, see first if you can do it via pre-processors, post-processors, inline patterns or a combination of the three. 
""" class CorePatterns: """ This class is scheduled for removal as part of a refactoring effort. """ patterns = { 'header': r'(#*)([^#]*)(#*)', # # A title 'reference-def': r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)', # [Google]: http://www.google.com/ 'containsline': r'([-]*)$|^([=]*)', # -----, =====, etc. 'ol': r'[ ]{0,3}[\d]*\.\s+(.*)', # 1. text 'ul': r'[ ]{0,3}[*+-]\s+(.*)', # "* text" 'isline1': r'(\**)', # *** 'isline2': r'(\-*)', # --- 'isline3': r'(\_*)', # ___ 'tabbed': r'((\t)|( ))(.*)', # an indented line 'quoted': r'[ ]{0,2}> ?(.*)', # a quoted block ("> ...") } def __init__ (self): self.regExp = {} for key in self.patterns.keys(): self.regExp[key] = re.compile("^%s$" % self.patterns[key], re.DOTALL) self.regExp['containsline'] = re.compile(r'^([-]*)$|^([=]*)$', re.M) RE = CorePatterns() class Markdown: """ Markdown formatter class for creating an html document from Markdown text. """ def __init__(self, source=None, # depreciated extensions=[], extension_configs={}, safe_mode = False): """ Creates a new Markdown instance. Keyword arguments: * source: The text in Markdown format. Depreciated! * extensions: A list of extensions. If they are of type string, the module mdx_name.py will be loaded. If they are a subclass of markdown.Extension, they will be used as-is. * extension-configs: Configuration setting for extensions. * safe_mode: Disallow raw html. """ self.source = source if source is not None: message(WARN, "The `source` arg of Markdown.__init__() is depreciated and will be removed in the future. 
Use `instance.convert(source)` instead.") self.safeMode = safe_mode self.blockGuru = BlockGuru() self.registeredExtensions = [] self.stripTopLevelTags = 1 self.docType = "" self.textPreprocessors = [HTML_BLOCK_PREPROCESSOR] self.preprocessors = [HEADER_PREPROCESSOR, LINE_PREPROCESSOR, # A footnote preprocessor will # get inserted here REFERENCE_PREPROCESSOR] self.postprocessors = [] # a footnote postprocessor will get # inserted later self.textPostprocessors = [# a footnote postprocessor will get # inserted here RAWHTMLTEXTPOSTPROCESSOR] self.prePatterns = [] self.inlinePatterns = [DOUBLE_BACKTICK_PATTERN, BACKTICK_PATTERN, ESCAPE_PATTERN, REFERENCE_PATTERN, LINK_ANGLED_PATTERN, LINK_PATTERN, IMAGE_LINK_PATTERN, IMAGE_REFERENCE_PATTERN, AUTOLINK_PATTERN, AUTOMAIL_PATTERN, LINE_BREAK_PATTERN_2, LINE_BREAK_PATTERN, HTML_PATTERN, ENTITY_PATTERN, NOT_STRONG_PATTERN, STRONG_EM_PATTERN, STRONG_EM_PATTERN_2, STRONG_PATTERN, STRONG_PATTERN_2, EMPHASIS_PATTERN, EMPHASIS_PATTERN_2 # The order of the handlers matters!!! ] self.registerExtensions(extensions = extensions, configs = extension_configs) self.reset() def registerExtensions(self, extensions, configs): """ Register extensions with this instance of Markdown. Keyword aurguments: * extensions: A list of extensions, which can either be strings or objects. See the docstring on Markdown. * configs: A dictionary mapping module names to config options. """ for ext in extensions: if isinstance(ext, basestring): ext = load_extension(ext, configs.get(ext, [])) elif hasattr(ext, 'extendMarkdown'): # Looks like an Extension. # Nothing to do here. pass else: message(ERROR, "Incorrect type! Extension '%s' is " "neither a string or an Extension." %(repr(ext))) continue ext.extendMarkdown(self, globals()) def registerExtension(self, extension): """ This gets called by the extension """ self.registeredExtensions.append(extension) def reset(self): """ Resets all state variables so that we can start with a new text. 
""" self.references={} self.htmlStash = HtmlStash() HTML_BLOCK_PREPROCESSOR.stash = self.htmlStash LINE_PREPROCESSOR.stash = self.htmlStash REFERENCE_PREPROCESSOR.references = self.references HTML_PATTERN.stash = self.htmlStash ENTITY_PATTERN.stash = self.htmlStash REFERENCE_PATTERN.references = self.references IMAGE_REFERENCE_PATTERN.references = self.references RAWHTMLTEXTPOSTPROCESSOR.stash = self.htmlStash RAWHTMLTEXTPOSTPROCESSOR.safeMode = self.safeMode for extension in self.registeredExtensions: extension.reset() def _transform(self): """Transforms the Markdown text into a XHTML body document Returns: A NanoDom Document """ # Setup the document self.doc = Document() self.top_element = self.doc.createElement("span") self.top_element.appendChild(self.doc.createTextNode('\n')) self.top_element.setAttribute('class', 'markdown') self.doc.appendChild(self.top_element) # Split into lines and run the preprocessors that will work with # self.lines self.lines = self.source.split("\n") # Run the pre-processors on the lines for prep in self.preprocessors : self.lines = prep.run(self.lines) # Create a NanoDom tree from the lines and attach it to Document buffer = [] for line in self.lines: if line.startswith("#"): self._processSection(self.top_element, buffer) buffer = [line] else: buffer.append(line) self._processSection(self.top_element, buffer) #self._processSection(self.top_element, self.lines) # Not sure why I put this in but let's leave it for now. self.top_element.appendChild(self.doc.createTextNode('\n')) # Run the post-processors for postprocessor in self.postprocessors: postprocessor.run(self.doc) return self.doc def _processSection(self, parent_elem, lines, inList = 0, looseList = 0): """ Process a section of a source document, looking for high level structural elements like lists, block quotes, code segments, html blocks, etc. Some those then get stripped of their high level markup (e.g. get unindented) and the lower-level markup is processed recursively. 
Keyword arguments: * parent_elem: A NanoDom element to which the content will be added. * lines: a list of lines * inList: a level Returns: None """ # Loop through lines until none left. while lines: # Check if this section starts with a list, a blockquote or # a code block processFn = { 'ul': self._processUList, 'ol': self._processOList, 'quoted': self._processQuote, 'tabbed': self._processCodeBlock} for regexp in ['ul', 'ol', 'quoted', 'tabbed']: m = RE.regExp[regexp].match(lines[0]) if m: processFn[regexp](parent_elem, lines, inList) return # We are NOT looking at one of the high-level structures like # lists or blockquotes. So, it's just a regular paragraph # (though perhaps nested inside a list or something else). If # we are NOT inside a list, we just need to look for a blank # line to find the end of the block. If we ARE inside a # list, however, we need to consider that a sublist does not # need to be separated by a blank line. Rather, the following # markup is legal: # # * The top level list item # # Another paragraph of the list. This is where we are now. # * Underneath we might have a sublist. 
# if inList: start, lines = self._linesUntil(lines, (lambda line: RE.regExp['ul'].match(line) or RE.regExp['ol'].match(line) or not line.strip())) self._processSection(parent_elem, start, inList - 1, looseList = looseList) inList = inList-1 else: # Ok, so it's just a simple block paragraph, lines = self._linesUntil(lines, lambda line: not line.strip()) if len(paragraph) and paragraph[0].startswith('#'): self._processHeader(parent_elem, paragraph) elif paragraph: self._processParagraph(parent_elem, paragraph, inList, looseList) if lines and not lines[0].strip(): lines = lines[1:] # skip the first (blank) line def _processHeader(self, parent_elem, paragraph): m = RE.regExp['header'].match(paragraph[0]) if m: level = len(m.group(1)) h = self.doc.createElement("h%d" % level) parent_elem.appendChild(h) for item in self._handleInline(m.group(2).strip()): h.appendChild(item) else: message(CRITICAL, "We've got a problem header!") def _processParagraph(self, parent_elem, paragraph, inList, looseList): list = self._handleInline("\n".join(paragraph)) if ( parent_elem.nodeName == 'li' and not (looseList or parent_elem.childNodes)): # If this is the first paragraph inside "li", don't # putaround it - append the paragraph bits directly # onto parent_elem el = parent_elem else: # Otherwise make a "p" element el = self.doc.createElement("p") parent_elem.appendChild(el) for item in list: el.appendChild(item) def _processUList(self, parent_elem, lines, inList): self._processList(parent_elem, lines, inList, listexpr='ul', tag = 'ul') def _processOList(self, parent_elem, lines, inList): self._processList(parent_elem, lines, inList, listexpr='ol', tag = 'ol') def _processList(self, parent_elem, lines, inList, listexpr, tag): """ Given a list of document lines starting with a list item, finds the end of the list, breaks it up, and recursively processes each list item and the remainder of the text file. 
Keyword arguments: * parent_elem: A dom element to which the content will be added * lines: a list of lines * inList: a level Returns: None """ ul = self.doc.createElement(tag) # ul might actually be '
") and recursively processes the
the remainder of the text file.
Keyword arguments:
* parent_elem: DOM element to which the content will be added
* lines: a list of lines
* inList: a level
Returns: None
"""
detabbed, theRest = self.blockGuru.detectTabbed(lines)
pre = self.doc.createElement('pre')
code = self.doc.createElement('code')
parent_elem.appendChild(pre)
pre.appendChild(code)
text = "\n".join(detabbed).rstrip()+"\n"
#text = text.replace("&", "&")
code.appendChild(self.doc.createTextNode(text))
self._processSection(parent_elem, theRest, inList)
def _handleInline (self, line, patternIndex=0):
"""
Transform a Markdown line with inline elements to an XHTML
fragment.
This function uses auxiliary objects called inline patterns.
See notes on inline patterns above.
Keyword arguments:
* line: A line of Markdown text
* patternIndex: The index of the inlinePattern to start with
Return: A list of NanoDom nodes
"""
parts = [line]
while patternIndex < len(self.inlinePatterns):
i = 0
while i < len(parts):
x = parts[i]
if isinstance(x, (str, unicode)):
result = self._applyPattern(x, \
self.inlinePatterns[patternIndex], \
patternIndex)
if result:
i -= 1
parts.remove(x)
for y in result:
parts.insert(i+1,y)
i += 1
patternIndex += 1
for i in range(len(parts)):
x = parts[i]
if isinstance(x, (str, unicode)):
parts[i] = self.doc.createTextNode(x)
return parts
def _applyPattern(self, line, pattern, patternIndex):
    """
    Try a single inline pattern against a piece of text.

    If the pattern's compiled regular expression matches, the pattern's
    `handleMatch` builds a node for the match.  Text-node children of a
    resulting Element (other than "code" and "pre" elements) are then run
    through `_handleInline` with the patterns that follow this one, and
    replaced by whatever that produces.

    Keyword arguments:

    * line: the text to be processed
    * pattern: the pattern to be checked
    * patternIndex: index of `pattern` in self.inlinePatterns; children
      are processed starting from patternIndex + 1

    Returns: None if the pattern does not match or the handler returns a
             false value.  Otherwise a 3-tuple
             (last regex group, node, first regex group).  The caller
             inserts these in reverse order, so the first regex group
             ends up in front of the node.
    """
    # match the line to pattern's pre-compiled reg exp.
    # if no match, move on.
    m = pattern.getCompiledRegExp().match(line)
    if not m:
        return None

    # if we got a match let the pattern make us a NanoDom node
    node = pattern.handleMatch(m, self.doc)

    # check if any of this node's children need processing
    if isinstance(node, Element) and node.nodeName not in ["code", "pre"]:
        # Walk a snapshot of the children: the loop body removes and
        # inserts children, which must not disturb the iteration.
        for child in list(node.childNodes):
            if not isinstance(child, TextNode):
                continue

            result = self._handleInline(child.value, patternIndex + 1)
            if not result or result == [child]:
                continue

            # Inserting every item at the same position reverses them,
            # so reverse first to keep the original order.
            result.reverse()
            position = node.childNodes.index(child)
            node.removeChild(child)

            for item in result:
                if isinstance(item, (str, unicode)):
                    if len(item) > 0:
                        node.insertChild(position,
                                         self.doc.createTextNode(item))
                else:
                    node.insertChild(position, item)

    if not node:
        return None

    # Those are in the reverse order!
    return (m.groups()[-1],  # the last group of the match
            node,            # the new node
            m.group(1))      # the first group of the match
def convert (self, source=None):
"""
Return the document in XHTML format.
Keyword arguments:
* source: An ascii or unicode string of Markdown formated text.
Returns: A serialized XHTML body.
"""
if source is not None: #Allow blank string
self.source = source
if not self.source:
return u""
try:
self.source = unicode(self.source)
except UnicodeDecodeError:
message(CRITICAL, 'UnicodeDecodeError: Markdown only accepts unicode or ascii input.')
return u""
# Fixup the source text
self.source = self.source.replace("\r\n", "\n").replace("\r", "\n")
self.source += "\n\n"
self.source = self.source.expandtabs(TAB_LENGTH)
for pp in self.textPreprocessors:
self.source = pp.run(self.source)
doc = self._transform()
xml = doc.toxml()
# Return everything but the top level tag
if self.stripTopLevelTags:
xml = xml.strip()[23:-7] + "\n"
for pp in self.textPostprocessors:
xml = pp.run(xml)
return (self.docType + xml).strip()
def __str__(self):
''' Report info about instance. Markdown always returns unicode. '''
if self.source is None:
status = 'in which no source text has been assinged.'
else:
status = 'which contains %d chars and %d line(s) of source.'%\
(len(self.source), self.source.count('\n')+1)
return 'An instance of "%s" %s'% (self.__class__, status)
# Alias: unicode(instance) runs convert() on the stored source.
# markdown should always return a unicode string
__unicode__ = convert
# ====================================================================
def markdownFromFile(input = None,
                     output = None,
                     extensions = [],
                     encoding = None,
                     message_threshold = CRITICAL,
                     safe = False):
    """
    Convenience wrapper function that takes a filename as input.

    Used from the command-line, although may be useful in other situations.
    Decodes the file using the provided encoding (defaults to utf-8), passes
    the file content to markdown, and outputs the html to either the provided
    filename or stdout in the same encoding as the source file.

    **Note:** This is the only place that decoding and encoding takes place
    in Python-Markdown.

    Keyword arguments:

    * input: Name of source text file.
    * output: Name of output file. Writes to stdout if `None`.
    * extensions: A list of extension names (may contain config args).
    * encoding: Encoding of input and output files. Defaults to utf-8.
    * message_threshold: Error reporting level.
    * safe: Passed to `markdown()` as `safe_mode` (disallow raw html).

    Returns: None. The HTML is written to `output` or to stdout.
    """
    global console_hndlr
    console_hndlr.setLevel(message_threshold)

    message(DEBUG, "input file: %s" % input)

    if not encoding:
        encoding = "utf-8"

    input_file = codecs.open(input, mode="r", encoding=encoding)
    try:
        text = input_file.read()
    finally:
        # Close the handle even when reading or decoding fails.
        input_file.close()
    text = removeBOM(text, encoding)

    new_text = markdown(text, extensions, safe_mode = safe)

    if output:
        output_file = codecs.open(output, "w", encoding=encoding)
        try:
            output_file.write(new_text)
        finally:
            # Close the handle even when writing or encoding fails.
            output_file.close()
    else:
        sys.stdout.write(new_text.encode(encoding))
def markdown(text,
             extensions = [],
             safe_mode = False):
    """
    Convenience wrapper function for `Markdown` class.

    Useful in a typical use case. Initializes an instance of the `Markdown`
    class, loads any extensions and runs the parser on the given text.

    Keyword arguments:

    * text: An ascii or Unicode string of Markdown formatted text.
    * extensions: A list of extensions. Strings are extension names (may
      contain config args) and are loaded with `load_extension`; anything
      else is handed to `Markdown` unchanged, which accepts extension
      objects as well as names.
    * safe_mode: Disallow raw html. One of "remove", "replace" or "escape".

    Returns: An HTML document as a string.
    """
    message(DEBUG, "in markdown.markdown(), received text:\n%s" % text)

    loaded = []
    for ext in extensions:
        # Only names need loading; load_extension() expects a string and
        # would fail on an already constructed extension object.
        if isinstance(ext, basestring):
            ext = load_extension(ext)
        loaded.append(ext)

    md = Markdown(extensions=loaded,
                  safe_mode = safe_mode)

    return md.convert(text)
class Extension:
    """
    Holder for the configuration of a Markdown extension.

    `self.config` maps an option name to a two-item sequence: item 0 is
    the option's current value and item 1 is extra information about the
    option (presumably a description).
    """

    def __init__(self, configs=None):
        """
        Keyword arguments:

        * configs: the configuration mapping to use.  It is stored as-is
          (not copied).  When omitted, every instance gets its own new
          empty dict, so settings never leak between instances.
        """
        if configs is None:
            configs = {}
        self.config = configs

    def getConfig(self, key):
        """ Return the current value of option `key`, or "" if unknown. """
        if key in self.config:
            return self.config[key][0]
        return ""

    def getConfigInfo(self):
        """ Return a list of (option name, item 1 of the option) pairs. """
        return [(key, entry[1]) for key, entry in self.config.items()]

    def setConfig(self, key, value):
        """
        Set the current value of the existing option `key`.

        Raises KeyError if the option has not been defined.
        """
        self.config[key][0] = value
def load_extension(ext_name, configs = []):
    """
    Load an extension by name and return the extension it creates.

    The module "mdx_<name>" is imported and the result of its
    `makeExtension(config_items)` function is returned.

    The extension name may contain arguments as part of the string in the
    following format:

    "extname(key1=value1,key2=value2)"

    Only the first "=" of each argument separates key from value, so
    values may themselves contain "=".  Empty arguments (as in
    "extname()" or after a trailing comma) are ignored.  Arguments given
    in the name override entries of `configs` with the same key.

    Keyword arguments:

    * ext_name: the extension name, optionally followed by arguments.
    * configs: a sequence of (key, value) pairs or a dict of options.

    Print an error message and exit on failure to import the module.
    Raises ValueError for a non-empty argument that contains no "=".
    """
    # I am making the assumption that the order of config options
    # does not matter.
    configs = dict(configs)
    pos = ext_name.find("(")
    if pos > 0:
        ext_args = ext_name[pos+1:-1]
        ext_name = ext_name[:pos]
        for pair in ext_args.split(","):
            if not pair.strip():
                continue
            key, value = pair.split("=", 1)
            configs[key.strip()] = value.strip()

    extension_module_name = "mdx_" + ext_name

    try:
        module = __import__(extension_module_name)
    except Exception:
        # Any failure while importing the extension is fatal, but do not
        # intercept KeyboardInterrupt/SystemExit the way a bare except does.
        message(CRITICAL,
                "couldn't load extension %s (looking for %s module)"
                % (ext_name, extension_module_name) )
        sys.exit(1)

    return module.makeExtension(configs.items())
# Usage hint printed by parse_options() when the `optparse` module cannot
# be imported and the script was not called with exactly one argument.
# The placeholder is filled with EXECUTABLE_NAME_FOR_USAGE.
OPTPARSE_WARNING = """
Python 2.3 or higher required for advanced command line options.
For lower versions of Python use:
%s INPUT_FILE > OUTPUT_FILE
""" % EXECUTABLE_NAME_FOR_USAGE
def parse_options():
    """
    Define and parse `optparse` options for command-line usage.

    If `optparse` cannot be imported, only the plain form with a single
    command-line argument (the input file) is supported.

    Returns: a dict with the keys 'input', 'output', 'message_threshold',
             'safe', 'extensions' and 'encoding', or None when the
             command line is unusable (a usage message has then already
             been printed).
    """
    try:
        optparse = __import__("optparse")
    except ImportError:
        if len(sys.argv) == 2:
            return {'input': sys.argv[1],
                    'output': None,
                    'message_threshold': CRITICAL,
                    'safe': False,
                    'extensions': [],
                    'encoding': None }
        # Same output as a print statement: the text plus a newline.
        sys.stdout.write(OPTPARSE_WARNING + "\n")
        return None

    parser = optparse.OptionParser(usage="%prog INPUTFILE [options]")

    parser.add_option("-f", "--file", dest="filename",
                      help="write output to OUTPUT_FILE",
                      metavar="OUTPUT_FILE")
    parser.add_option("-e", "--encoding", dest="encoding",
                      help="encoding for input and output files",)
    # -q, -v and --noisy all write to `verbose`; the default given here
    # is the value used when none of them is passed.
    parser.add_option("-q", "--quiet", default = CRITICAL,
                      action="store_const", const=60, dest="verbose",
                      help="suppress all messages")
    parser.add_option("-v", "--verbose",
                      action="store_const", const=INFO, dest="verbose",
                      help="print info messages")
    parser.add_option("-s", "--safe", dest="safe", default=False,
                      metavar="SAFE_MODE",
                      help="safe mode ('replace', 'remove' or 'escape' user's HTML tag)")
    parser.add_option("--noisy",
                      action="store_const", const=DEBUG, dest="verbose",
                      help="print debug messages")
    parser.add_option("-x", "--extension", action="append", dest="extensions",
                      help = "load extension EXTENSION", metavar="EXTENSION")

    (options, args) = parser.parse_args()

    # Exactly one positional argument (the input file) is required.
    if len(args) != 1:
        parser.print_help()
        return None
    input_file = args[0]

    # With no -x option given, optparse leaves `extensions` as None.
    if not options.extensions:
        options.extensions = []

    return {'input': input_file,
            'output': options.filename,
            'message_threshold': options.verbose,
            'safe': options.safe,
            'extensions': options.extensions,
            'encoding': options.encoding }
if __name__ == '__main__':
    # Run Markdown from the command line.
    options = parse_options()

    if options:
        markdownFromFile(**options)
    else:
        # parse_options() has already printed its usage message.
        sys.exit(0)