#!/usr/bin/env python """ ==================================================================== IF YOU ARE LOOKING TO EXTEND MARKDOWN, SEE THE "FOOTNOTES" SECTION ==================================================================== Python-Markdown =============== Converts Markdown to HTML. Basic usage as a module: import markdown html = markdown.markdown(your_text_string) Started by [Manfred Stienstra](http://www.dwerg.net/). Continued and maintained by [Yuri Takhteyev](http://www.freewisdom.org). Project website: http://www.freewisdom.org/projects/python-markdown Contact: yuri [at] freewisdom.org License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD Version: 1.3 (Feb. 28, 2006) For changelog, see end of file """ import re, sys, os, random # set debug level: 3 none, 2 critical, 1 informative, 0 all (VERBOSE, INFO, CRITICAL, NONE) = range(4) MESSAGE_THRESHOLD = CRITICAL def message(level, text) : if level >= MESSAGE_THRESHOLD : print text # --------------- CONSTANTS YOU MIGHT WANT TO MODIFY ----------------- # all tabs will be expanded to up to this many spaces TAB_LENGTH = 4 ENABLE_ATTRIBUTES = 1 SMART_EMPHASIS = 1 # --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ---------- FN_BACKLINK_TEXT = "zz1337820767766393qq" # a template for html placeholders HTML_PLACEHOLDER_PREFIX = "qaodmasdkwaspemas" HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%dajkqlsmdqpakldnzsdfls" BLOCK_LEVEL_ELEMENTS = ['p', 'div', 'blockquote', 'pre', 'table', 'dl', 'ol', 'ul', 'script', 'noscript', 'form', 'fieldset', 'iframe', 'math', 'ins', 'del', 'hr', 'hr/'] def is_block_level (tag) : return ( (tag in BLOCK_LEVEL_ELEMENTS) or (tag[0] == 'h' and tag[1] in "0123456789") ) """ ====================================================================== ========================== NANODOM =================================== ====================================================================== The three classes below implement some of the most basic DOM methods. 
I use this instead of minidom because I need a simpler functionality and do not want to require additional libraries. Importantly, NanoDom does not do normalization, which is what we want. It also adds extra white space when converting DOM to string """ class Document : def appendChild(self, child) : self.documentElement = child child.parent = self self.entities = {} def createElement(self, tag, textNode=None) : el = Element(tag) el.doc = self if textNode : el.appendChild(self.createTextNode(textNode)) return el def createTextNode(self, text) : node = TextNode(text) node.doc = self return node def createEntityReference(self, entity): if entity not in self.entities: self.entities[entity] = EntityReference(entity) return self.entities[entity] def toxml (self) : return self.documentElement.toxml() def normalizeEntities(self, text) : pairs = [ #("&", "&"), ("<", "<"), (">", ">"), ("\"", """)] for old, new in pairs : text = text.replace(old, new) return text def find(self, test) : return self.documentElement.find(test) def unlink(self) : self.documentElement.unlink() self.documentElement = None class Element : type = "element" def __init__ (self, tag) : self.nodeName = tag self.attributes = [] self.attribute_values = {} self.childNodes = [] def unlink(self) : for child in self.childNodes : if child.type == "element" : child.unlink() self.childNodes = None def setAttribute(self, attr, value) : if not attr in self.attributes : self.attributes.append(attr) self.attribute_values[attr] = value def insertChild(self, position, child) : self.childNodes.insert(position, child) child.parent = self def removeChild(self, child) : self.childNodes.remove(child) def replaceChild(self, oldChild, newChild) : position = self.childNodes.index(oldChild) self.removeChild(oldChild) self.insertChild(position, newChild) def appendChild(self, child) : self.childNodes.append(child) child.parent = self def handleAttributes(self) : pass def find(self, test, depth=0) : """ Returns a list of 
descendants that pass the test function """ matched_nodes = [] for child in self.childNodes : if test(child) : matched_nodes.append(child) if child.type == "element" : matched_nodes += child.find(test, depth+1) return matched_nodes def toxml(self): if ENABLE_ATTRIBUTES : for child in self.childNodes: child.handleAttributes() buffer = "" if self.nodeName in ['h1', 'h2', 'h3', 'h4'] : buffer += "\n" elif self.nodeName in ['li'] : buffer += "\n " buffer += "<" + self.nodeName for attr in self.attributes : value = self.attribute_values[attr] value = self.doc.normalizeEntities(value) buffer += ' %s="%s"' % (attr, value) if self.childNodes : buffer += ">" for child in self.childNodes : buffer += child.toxml() if self.nodeName == 'p' : buffer += "\n" elif self.nodeName == 'li' : buffer += "\n " buffer += "%s>" % self.nodeName else : buffer += "/>" if self.nodeName in ['p', 'li', 'ul', 'ol', 'h1', 'h2', 'h3', 'h4'] : buffer += "\n" return buffer class TextNode : type = "text" attrRegExp = re.compile(r'\{@([^\}]*)=([^\}]*)}') # {@id=123} def __init__ (self, text) : self.value = text def attributeCallback(self, match) : self.parent.setAttribute(match.group(1), match.group(2)) def handleAttributes(self) : self.value = self.attrRegExp.sub(self.attributeCallback, self.value) def toxml(self) : text = self.value if not text.startswith(HTML_PLACEHOLDER_PREFIX): if self.parent.nodeName == "p" : text = text.replace("\n", "\n ") elif (self.parent.nodeName == "li" and self.parent.childNodes[0]==self): text = "\n " + text.replace("\n", "\n ") text = self.doc.normalizeEntities(text) return text class EntityReference: type = "entity_ref" def __init__(self, entity): self.entity = entity def handleAttributes(self): pass def toxml(self): return "&" + self.entity + ";" """ ====================================================================== ========================== PRE-PROCESSORS ============================ ====================================================================== 
Preprocessors munge source text before we start doing anything too complicated. Each preprocessor implements a "run" method that takes a pointer to a list of lines of the document, modifies it as necessary and returns either the same pointer or a pointer to a new list. """ class HeaderPreprocessor : """ Replaces underlined headers with hashed headers to avoid the nead for lookahead later. """ def run (self, lines) : for i in range(len(lines)) : if not lines[i] : continue if (i+1 <= len(lines) and lines[i+1] and lines[i+1][0] in ['-', '=']) : underline = lines[i+1].strip() if underline == "="*len(underline) : lines[i] = "# " + lines[i].strip() lines[i+1] = "" elif underline == "-"*len(underline) : lines[i] = "## " + lines[i].strip() lines[i+1] = "" return lines HEADER_PREPROCESSOR = HeaderPreprocessor() class LinePreprocessor : """Deals with HR lines (needs to be done before processing lists)""" def run (self, lines) : for i in range(len(lines)) : if self._isLine(lines[i]) : lines[i] = "
around it - append the paragraph bits directly # onto parent_elem el = parent_elem else : # Otherwise make a "p" element el = self.doc.createElement("p") parent_elem.appendChild(el) for item in list : el.appendChild(item) if theRest : theRest = theRest[1:] # skip the first (blank) line self._processSection(parent_elem, theRest, inList) def _processUList(self, parent_elem, lines, inList) : self._processList(parent_elem, lines, inList, listexpr='ul', tag = 'ul') def _processOList(self, parent_elem, lines, inList) : self._processList(parent_elem, lines, inList, listexpr='ol', tag = 'ol') def _processList(self, parent_elem, lines, inList, listexpr, tag) : """Given a list of document lines starting with a list item, finds the end of the list, breaks it up, and recursively processes each list item and the remainder of the text file. @param parent_elem: A dom element to which the content will be added @param lines: a list of lines @param inList: a level @returns: None""" ul = self.doc.createElement(tag) # ul might actually be '
") and recursively processes the
the remainder of the text file.
@param parent_elem: DOM element to which the content will be added
@param lines: a list of lines
@param inList: a level
@returns: None"""
detabbed, theRest = self.blockGuru.detectTabbed(lines)
pre = self.doc.createElement('pre')
code = self.doc.createElement('code')
parent_elem.appendChild(pre)
pre.appendChild(code)
text = "\n".join(detabbed).rstrip()+"\n"
text = text.replace("&", "&")
code.appendChild(self.doc.createTextNode(text))
self._processSection(parent_elem, theRest, inList)
def _handleInline(self, line):
"""Transform a Markdown line with inline elements to an XHTML fragment.
Note that this function works recursively: we look for a
pattern, which usually splits the paragraph in half, and then
call this function on the two parts.
This function uses auxiliary objects called inline patterns.
See notes on inline patterns above.
@param item: A block of Markdown text
@return: A list of NanoDomnodes """
if not(line):
return [self.doc.createTextNode(' ')]
# two spaces at the end of the line denote a
#if line.endswith(' '):
# list = self._handleInline( line.rstrip())
# list.append(self.doc.createElement('br'))
# return list
#
# ::TODO:: Replace with a preprocessor
for pattern in self.inlinePatterns :
list = self._applyPattern( line, pattern)
if list: return list
return [self.doc.createTextNode(line)]
def _applyPattern(self, line, pattern) :
""" Given a pattern name, this function checks if the line
fits the pattern, creates the necessary elements and
recursively calls _handleInline (via. _inlineRecurse)
@param line: the text to be processed
@param pattern: the pattern to be checked
@returns: the appropriate newly created NanoDom element if the
pattern matches, None otherwise.
"""
# match the line to pattern's pre-compiled reg exp.
# if no match, move on.
m = pattern.getCompiledRegExp().match(line)
if not m :
return None
# if we got a match let the pattern make us a NanoDom node
# if it doesn't, move on
node = pattern.handleMatch(m, self.doc)
if not node :
return None
# determine what we've got to the left and to the right
left = m.group(1) # the first match group
left_list = self._handleInline(left)
right = m.groups()[-1] # the last match group
right_list = self._handleInline(right)
# put the three parts together
left_list.append(node)
left_list.extend(right_list)
return left_list
def __str__(self):
    """Return the document in XHTML format.

    The source is transformed into a NanoDom tree and serialized.
    Afterwards the raw HTML pieces kept in self.htmlStash are put back
    in place of their placeholders and the footnote back-link marker
    is replaced by its character reference.

    @returns: A serialized XHTML body."""

    # NOTE: the tree is not unlink()ed after serialization.
    doc = self._transform()
    xml = doc.toxml()

    # Let's stick in all the raw html pieces
    for i in range(self.htmlStash.html_counter):
        placeholder = HTML_PLACEHOLDER % i
        raw_html = self.htmlStash.rawHtmlBlocks[i]
        # A placeholder that makes up a whole paragraph is serialized
        # as "<p>placeholder\n</p>"; replace the wrapper as well so the
        # raw HTML does not end up nested inside a paragraph.
        xml = xml.replace("<p>%s\n</p>" % placeholder, raw_html + "\n")
        # Any remaining (inline) occurrence is replaced as is.
        xml = xml.replace(placeholder, raw_html)

    # ASCII character reference for the back-link arrow; keeps this
    # source file free of non-ASCII bytes.
    xml = xml.replace(FN_BACKLINK_TEXT, "&#8617;")

    # And return everything but the top level tag
    if self.stripTopLevelTags:
        # NOTE(review): 23 and 7 are presumably the lengths of the
        # opening and closing top-level tags produced by _transform()
        # -- confirm there before changing the top-level element.
        xml = xml.strip()[23:-7]

    return xml

toString = __str__
"""
========================= FOOTNOTES =================================
This section adds footnote handling to markdown. It can be used as
an example for extending python-markdown with relatively complex
functionality. While in this case the extension is included inside
the module itself, it could just as easily be added from outside the
module. Note that all markdown classes above are ignorant about
footnotes. All footnote functionality is provided separately and
then added to the markdown instance at the run time.
Footnote functionality is attached by calling extendMarkdown()
method of FootnoteExtension. The method also registers the
extension to allow its state to be reset by a call to reset()
method.
"""
class FootnoteExtension:
    """Footnote support that is plugged into a Markdown instance.

    The object keeps the footnote texts (self.footnotes, keyed by
    label) and the number assigned to every referenced label
    (self.used_footnotes).  extendMarkdown() installs a preprocessor,
    an inline pattern and a postprocessor that share this state.
    """

    # "[^label]: text" -- a footnote definition, indented by at most
    # three spaces
    DEF_RE = re.compile(r'(\ ?\ ?\ ?)\[\^([^\]]*)\]:\s*(.*)')
    # "[^label]" -- a footnote reference
    SHORT_USE_RE = re.compile(r'\[\^([^\]]*)\]', re.M)  # [^a]

    def __init__(self):
        self.reset()

    def extendMarkdown(self, md):
        """Hook footnote processing into the Markdown instance `md`.

        @param md: the Markdown instance to extend"""

        self.md = md

        # Registering lets Markdown reset our state; stateless
        # extensions do not need to be registered.
        md.registerExtension(self)

        # Insert a preprocessor before ReferencePreprocessor
        index = md.preprocessors.index(REFERENCE_PREPROCESSOR)
        preprocessor = FootnotePreprocessor(self)
        preprocessor.md = md
        md.preprocessors.insert(index, preprocessor)

        # Insert an inline pattern before ImageReferencePattern
        FOOTNOTE_RE = r'\[\^([^\]]*)\]'  # blah blah [^1] blah
        index = md.inlinePatterns.index(IMAGE_REFERENCE_PATTERN)
        md.inlinePatterns.insert(index, FootnotePattern(FOOTNOTE_RE, self))

        # Insert a post-processor that would actually add the footnote div
        md.postprocessors.append(FootnotePostprocessor(self))

    def reset(self):
        """Forget all footnotes and pick a new id suffix.

        May be called by Markdown if a state reset is desired."""

        # A random suffix is appended to every generated id --
        # presumably so that ids from separately converted documents
        # do not clash when shown on one page.
        self.footnote_suffix = "-" + str(int(random.random()*1000000000))
        self.used_footnotes = {}
        self.footnotes = {}

    def setFootnote(self, id, text):
        """Store the text of the footnote labelled `id`."""
        self.footnotes[id] = text

    def makeFootnoteId(self, num):
        """Return the element id of footnote number `num`."""
        return 'fn%d%s' % (num, self.footnote_suffix)

    def makeFootnoteRefId(self, num):
        """Return the element id of the reference to footnote `num`."""
        return 'fnr%d%s' % (num, self.footnote_suffix)

    def makeFootnotesDiv(self, doc):
        """Creates the div with class='footnote' and populates it with
           the text of the footnotes.

           Footnotes are listed in the order of the numbers recorded in
           self.used_footnotes.  A footnote that is defined but never
           referenced has no number and is left out.

           @param doc: the document used to create the new elements
           @returns: the footnote div as a dom element, or None if
               there is no footnote to show"""

        # Pair every referenced footnote with its number.  Filtering on
        # used_footnotes avoids a KeyError for unreferenced definitions.
        numbered = [(self.used_footnotes[label], label)
                    for label in self.footnotes
                    if label in self.used_footnotes]
        if not numbered:
            return None
        numbered.sort()

        div = doc.createElement("div")
        div.setAttribute('class', 'footnote')
        div.appendChild(doc.createElement("hr"))
        ol = doc.createElement("ol")
        div.appendChild(ol)

        for num, label in numbered:
            li = doc.createElement('li')
            li.setAttribute('id', self.makeFootnoteId(num))

            # The footnote text is itself Markdown.
            self.md._processSection(li, self.footnotes[label].split("\n"))

            backlink = doc.createElement('a')
            backlink.setAttribute('href', '#' + self.makeFootnoteRefId(num))
            backlink.setAttribute('class', 'footnoteBackLink')
            backlink.setAttribute('title',
                                  'Jump back to footnote %d in the text' % num)
            backlink.appendChild(doc.createTextNode(FN_BACKLINK_TEXT))

            if li.childNodes:
                # Put the back link inside the last child element; if
                # the last child is plain text, attach it to the list
                # item itself.
                node = li.childNodes[-1]
                if node.type == "text":
                    node = li
                node.appendChild(backlink)

            ol.appendChild(li)

        return div
class FootnotePreprocessor:
    """Removes footnote definitions from the text and numbers the
       footnote references.

       The texts and the numbers are stored on the shared footnotes
       object passed to the constructor."""

    def __init__(self, footnotes):
        self.footnotes = footnotes

    def run(self, lines):
        """Process the lines of a document.

           @param lines: a list of lines of text
           @returns: a list of lines with the footnote definitions
               removed"""

        self.blockGuru = BlockGuru()
        lines = self._handleFootnoteDefinitions(lines)

        # Walk over all footnote marks in the text so that we know in
        # what order they are supposed to appear.  Nothing is
        # substituted here.
        text = "\n".join(lines)
        for match in self.footnotes.SHORT_USE_RE.finditer(text):
            self.recordFootnoteUse(match)

        return text.split("\n")

    def recordFootnoteUse(self, match):
        """Give the referenced footnote a number on its first use.

           @param match: a match of SHORT_USE_RE; group 1 is the label"""

        label = match.group(1).strip()
        used = self.footnotes.used_footnotes
        # A label that is referenced again keeps its number; assigning
        # a new one each time would leave gaps and create duplicates.
        if label not in used:
            used[label] = len(used) + 1

    def _handleFootnoteDefinitions(self, lines):
        """Finds all footnote definitions in the lines, stores their
           text and takes them out of the document.

           Each definition (its first line plus the tabbed lines that
           follow it) is replaced by a single empty line.

           @param lines: a list of lines of text
           @returns: a list of lines with footnote definitions
               removed"""

        kept = []
        remaining = lines
        while True:
            i, label, first_line = self._findFootnoteDefinition(remaining)
            if not label:
                # No (further) definition: keep the rest unchanged.
                kept.extend(remaining)
                return kept

            kept.extend(remaining[:i])
            kept.append("")

            # The tabbed lines right after the definition belong to it.
            detabbed, remaining = self.blockGuru.detectTabbed(
                remaining[i+1:])
            self.footnotes.setFootnote(label,
                                       first_line + "\n"
                                       + "\n".join(detabbed))

    def _findFootnoteDefinition(self, lines):
        """Finds the first line of a footnote definition.

           @param lines: a list of lines of text
           @returns: a tuple (index, label, text) for the first line
               matching DEF_RE, where text is the rest of that line;
               (len(lines), None, None) if no line matches"""

        for index, line in enumerate(lines):
            m = self.footnotes.DEF_RE.match(line)
            if m:
                return index, m.group(2), m.group(3)
        return len(lines), None, None
class FootnotePattern(BasePattern):
    """Inline pattern that turns a footnote reference into a
       superscript link to the footnote."""

    def __init__(self, pattern, footnotes):
        BasePattern.__init__(self, pattern)
        self.footnotes = footnotes

    def handleMatch(self, m, doc):
        """Build the <sup><a>number</a></sup> element for a reference.

           @param m: the match object; group 2 is read as the footnote
               label (presumably group 1 is the text preceding the
               reference, added by BasePattern -- TODO confirm)
           @param doc: the document used to create the new elements
           @returns: the sup element"""

        sup = doc.createElement('sup')
        a = doc.createElement('a')
        sup.appendChild(a)

        # Labels are recorded stripped of surrounding white space (see
        # FootnotePreprocessor.recordFootnoteUse), so look them up the
        # same way.
        label = m.group(2).strip()
        num = self.footnotes.used_footnotes[label]

        sup.setAttribute('id', self.footnotes.makeFootnoteRefId(num))
        a.setAttribute('href', '#' + self.footnotes.makeFootnoteId(num))
        a.appendChild(doc.createTextNode(str(num)))
        return sup
class FootnotePostprocessor:
    """Postprocessor that adds the footnotes div to the finished
       document."""

    def __init__(self, footnotes):
        # the shared footnotes object that builds the div
        self.footnotes = footnotes

    def run(self, doc):
        """Append the footnotes div, if there is one, to the document
           element of `doc`."""
        div = self.footnotes.makeFootnotesDiv(doc)
        if not div:
            return
        doc.documentElement.appendChild(div)
# ====================================================================
def markdown(text):
    """Convert a Markdown string and return the result as a string.

    @param text: the Markdown source
    @returns: str() of a Markdown instance built from `text`"""
    message(VERBOSE, "in markdown.py, received text:\n%s" % text)
    converter = Markdown(text)
    return str(converter)
def markdownWithFootnotes(text):
    """Convert a Markdown string with footnote support switched on.

    A Markdown instance is created without a source, extended with a
    FootnoteExtension, and only then given the text.

    @param text: the Markdown source
    @returns: str() of the extended Markdown instance"""
    message(VERBOSE, "Running markdown with footnotes, "
            + "received text:\n%s" % text)
    converter = Markdown()
    FootnoteExtension().extendMarkdown(converter)
    converter.source = text
    return str(converter)
def test_markdown(args):
    """Exercise the command-line front end with a fixed series of calls.

    Every call hands cmd_line() an argv-style list whose first element
    stands in for the module name.  The `args` parameter is not used.
    A sample input file, markdown-test.txt, is written to the current
    directory; the last test asks cmd_line() to write
    markdown-test.html."""

    script = "markdown.py"
    fid = "markdown-test.txt"
    fidout = "markdown-test.html"

    def run_case(label, extra_args):
        # announce the test, then run the command line with its arguments
        print(label)
        cmd_line([script] + extra_args)

    run_case("\nTEST 1: no arguments on command line", [])
    run_case("\nTEST 2a: 1 argument on command line: a good option",
             ["-footnotes"])
    run_case("\nTEST 2b: 1 argument on command line: a bad option",
             ["-foodnotes"])
    run_case("\nTEST 3: 1 argument on command line: non-existent input file",
             ["junk.txt"])

    # TEST 4 needs a real input file, created after its header is printed.
    print("\nTEST 4: 1 argument on command line: existing input file")
    lines = """
Markdown text with[^1]:
2. **bold text**,
3. *italic text*.
Then more:
beginning of code block;
another line of code block.
a second paragraph of code block.
more text to end our file.
[^1]: "italic" means emphasis.
"""
    sample = open(fid, 'w+')
    sample.write(lines)
    sample.close()
    cmd_line([script, fid])

    run_case("\nTEST 5: 2 arguments on command line: nofootnotes and input file",
             ["-nofootnotes", fid])
    run_case("\nTEST 6: 2 arguments on command line: footnotes and input file",
             ["-footnotes", fid])
    run_case("\nTEST 7: 3 arguments on command line: nofootnotes,inputfile, outputfile",
             ["-nofootnotes", fid, fidout])
def get_vars(args):
    """Split an argv-style list into (option, inFile, outFile).

    args[0] is the program name and is ignored.  Of the remaining
    arguments:

    - three or more: the first three are taken as option, input file
      and output file, in that order;
    - two: an option and an input file if the first one starts with
      '-', otherwise an input file and an output file;
    - one: an option if it starts with '-', otherwise an input file.

    Nothing is validated here.

    @param args: the command-line arguments, program name first
    @returns: a tuple (option, inFile, outFile); parts that were not
        given are None"""

    message(VERBOSE, "in get_vars(), args: %s" % args)

    option = inFile = outFile = None
    count = len(args)

    if count >= 4:
        option, inFile, outFile = args[1:4]
    elif count == 3:
        first, second = args[1:3]
        # startswith() instead of first[0]: an empty-string argument
        # must not raise IndexError
        if first.startswith('-'):
            option, inFile = first, second
        else:
            inFile, outFile = first, second
    elif count == 2:
        # only one usable arg: might be an option or a file
        only = args[1]
        message(VERBOSE, "our single arg is: %s" % str(only))
        if only.startswith('-'):
            option = only
        else:
            inFile = only

    message(VERBOSE,
            "prior to validation, option: %s, inFile: %s, outFile: %s" %
            (str(option), str(inFile), str(outFile),))

    return option, inFile, outFile
# Help text shown for the help option and after command-line errors.
USAGE = """
\nUsing markdown.py:
python markdown.py [option] input_file_with_markdown.txt [output_file.html]
Options:
-footnotes or -fn : generate markdown with footnotes
-test or -t : run a self-test
-help or -h : print this message
"""

# Option names (without the leading dash) that validate_option() accepts.
VALID_OPTIONS = [
    'footnotes',
    'nofootnotes',
    'fn',
    'test',
    't',
    'f',
    'help',
    'h',
]

# Short option names and the long names they stand for.  'f' and
# 'nofootnotes' are valid but have no entry here.
EXPANDED_OPTIONS = {
    "fn": "footnotes",
    "t": "test",
    "h": "help",
}
def validate_option(option):
    """Check a command-line option and print a message if it is not
    understood.

    The first character of the option is dropped and the rest is
    looked up in VALID_OPTIONS; short names are replaced by their long
    form from EXPANDED_OPTIONS.

    @param option: the option as given on the command line, or None
    @return: valid option string (without its first character, in
        long form where one exists) or None
    """
    if option is None:
        return None

    name = option[1:]
    if len(option) > 1 and name in VALID_OPTIONS:
        return EXPANDED_OPTIONS.get(name, name)

    message(CRITICAL,
            "\nSorry, I don't understand option %s" % option)
    message(CRITICAL, USAGE)
    return None
def validate_input_file(inFile):
    """Check that an input file was named and that it can be read.

    A message is printed when the name is missing (together with the
    usage text) or when the file is not readable.

    @param inFile: path of the input file, or an empty value
    @return: valid input file path or None
    """
    if inFile and os.access(inFile, os.R_OK):
        return inFile

    if not inFile:
        message(CRITICAL,
                "\nI need an input filename.\n")
        message(CRITICAL, USAGE)
    else:
        message(CRITICAL, "Sorry, I can't find input file %s" % str(inFile))
    return None
def cmd_line(args):
    """Run markdown as a command-line program.

    The arguments are split by get_vars() and validated.  The "help"
    option prints the usage text and the "test" option runs
    test_markdown(); otherwise the input file is converted (with
    footnotes if the "footnotes" option was given) and the result is
    printed, or written to the output file if one was named.

    Problems with the arguments are reported through message() and
    end the call early.

    @param args: the command-line arguments, program name first
    @returns: None"""

    message(VERBOSE, "in cmd_line with args: %s" % args)

    option, inFile, outFile = get_vars(args)

    if option:
        option = validate_option(option)
        if not option:
            return

    if option == "help":
        message(CRITICAL, USAGE)
        return
    if option == "test":
        test_markdown(None)
        return

    inFile = validate_input_file(inFile)
    if not inFile:
        return

    # Read the source, closing the file even if reading fails.
    source_file = open(inFile)
    try:
        source_text = source_file.read()
    finally:
        source_file.close()

    message(VERBOSE, "Validated command line parameters:" +
            "\n\toption: %s, \n\tinFile: %s, \n\toutFile: %s" % (
             str(option), str(inFile), str(outFile),))

    if option == "footnotes":
        md_function = markdownWithFootnotes
    else:
        md_function = markdown

    output = md_function(source_text)

    if outFile is None:
        print(output)
        return

    # Write the result, closing the file even if writing fails.
    target_file = open(outFile, "w+")
    try:
        target_file.write(output)
    finally:
        target_file.close()

    if os.access(outFile, os.F_OK):
        message(INFO, "Successfully wrote %s" % outFile)
    else:
        message(INFO, "Failed to write %s" % outFile)
if __name__ == '__main__':
    # Run Markdown from the command line.
    # For diagnostic output, set MESSAGE_THRESHOLD near the top of the
    # file to VERBOSE.
    args = sys.argv

    # set testing=1 to test the command-line response of markdown.py
    testing = 0

    if not testing:
        cmd_line(args)
    else:
        test_markdown(args)
"""
CHANGELOG
=========
Mar. 15, 2006: Replaced some instance variables with class variables
(a patch from Stelios Xanthakis). Chris Clark's new regexps that do
not trigger midword underlining.
Feb. 28, 2006: Clean-up and command-line handling by Stewart
Midwinter. (Version 1.3)
Feb. 24, 2006: Fixed a bug with the last line of the list appearing
again as a separate paragraph. Incorporated Chris Clark's "mailto"
patch. Added support for <br /> at the end of lines ending in two or
more spaces. Fixed a crashing bug when using ImageReferencePattern.
Added several utility methods to Nanodom. (Version 1.2)
Jan. 31, 2006: Added "hr" and "hr/" to BLOCK_LEVEL_ELEMENTS and
changed <hr/> to <hr />. (Thanks to Sergej Chodarev.)
Nov. 26, 2005: Fixed a bug with certain tabbed lines inside lists
getting wrapped in <pre><code>. (v. 1.1)
Nov. 19, 2005: Made "