aboutsummaryrefslogtreecommitdiffstats
path: root/markdown.py
diff options
context:
space:
mode:
authorYuri Takhteyev <yuri@freewisdom.org>2008-10-07 01:32:56 -0700
committerYuri Takhteyev <yuri@freewisdom.org>2008-10-07 01:32:56 -0700
commit40b8986ccf0ea3fa37dda469b46261dfbf0c25a4 (patch)
treec29877335eea157e1bfcea87c7332e1ab4f6ff6a /markdown.py
parent6d719bd60b31e7fad3aae345a30f2820e2fd6215 (diff)
downloadmarkdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.tar.gz
markdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.tar.bz2
markdown-40b8986ccf0ea3fa37dda469b46261dfbf0c25a4.zip
All sorts of cleanup.
The bigger changes include getting rid of old BOM-removal logic and getting rid of BlockGuru. Most of the changes are just re-ordering of functions, removal of whitespace, adding comments, etc.
Diffstat (limited to 'markdown.py')
-rwxr-xr-xmarkdown.py687
1 files changed, 291 insertions, 396 deletions
diff --git a/markdown.py b/markdown.py
index 52f278e..95bf61d 100755
--- a/markdown.py
+++ b/markdown.py
@@ -42,174 +42,132 @@ License: BSD (see docs/LICENSE for details).
version = "2.0-alpha"
version_info = (2,0,0, "beta")
-import re, sys, codecs, htmlentitydefs
+import re
+import sys
+import codecs
+import htmlentitydefs
import logging
from logging import DEBUG, INFO, WARN, ERROR, CRITICAL
from urlparse import urlparse, urlunparse
-# --------------- Constants you might want to modify ------------------------
-COMMAND_LINE_LOGGING_LEVEL = CRITICAL
-TAB_LENGTH = 4 # expand tabs to this many spaces
-ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
+"""
+CONSTANTS
+=============================================================================
+"""
+
+"""
+Constants you might want to modify
+-----------------------------------------------------------------------------
+"""
+
+# default logging level for command-line use
+COMMAND_LINE_LOGGING_LEVEL = CRITICAL
+TAB_LENGTH = 4 # expand tabs to this many spaces
+ENABLE_ATTRIBUTES = True # @id = xyz -> <... id="xyz">
SMART_EMPHASIS = True # this_or_that does not become this<i>or</i>that
HTML_REMOVED_TEXT = "[HTML_REMOVED]" # text used instead of HTML in safe mode
+BLOCK_LEVEL_ELEMENTS = re.compile("p|div|h[1-6]|blockquote|pre|table|dl|ol|ul"
+ +"|script|noscript|form|fieldset|iframe|math"
+ +"|ins|del|hr|hr/|style|li|tr")
+
+"""
+Constants you probably do not need to change
+-----------------------------------------------------------------------------
+"""
+RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
+ # Hebrew (0590-05FF), Arabic (0600-06FF),
+ # Syriac (0700-074F), Arabic supplement (0750-077F),
+ # Thaana (0780-07BF), Nko (07C0-07FF).
+ (u'\u2D30', u'\u2D7F'), # Tifinagh
+ )
-# --------------- Auxiliary functions ---------------------------------------
+EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
+""" The name used in the usage statement displayed for python versions < 2.3.
+(With python 2.3 and higher the usage statement is generated by optparse
+and uses the actual name of the executable called.) """
+
+# Placeholders
+STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder
+ETX = u'\u0003' # Use ETX ("End of text") for end-of-placeholder
+HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:"
+HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX
+INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
+INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
+AMP_SUBSTITUTE = STX+"amp"+ETX
+
+
+"""
+AUXILIARY GLOBAL FUNCTIONS
+=============================================================================
+"""
def message(level, text):
- ''' A wrapper method for logging debug messages. '''
+ """ A wrapper method for logging debug messages. """
logging.getLogger('MARKDOWN').log(level, text)
-def isstr(s):
+def isString(s):
""" Check if it's string """
return isinstance(s, unicode) or isinstance(s, str)
## Import
def importETree():
- """ Import best variant of ElementTree and return module object """
- cetree = None
- try:
- # Python 2.5+
- import xml.etree.cElementTree as cetree
+ """Import the best implementation of ElementTree, return a module object."""
+ etree_in_c = None
+ try: # Is it Python 2.5+ with C implementation of ElementTree installed?
+ import xml.etree.cElementTree as etree_in_c
except ImportError:
- try:
- # Python 2.5+
+ try: # Is it Python 2.5+ with Python implementation of ElementTree?
import xml.etree.ElementTree as etree
except ImportError:
- try:
- # normal cElementTree install
- import cElementTree as cetree
+ try: # An earlier version of Python with cElementTree installed?
+ import cElementTree as etree_in_c
except ImportError:
- try:
- # normal ElementTree install
+ try: # An earlier version of Python with Python ElementTree?
import elementtree.ElementTree as etree
except ImportError:
- message(CRITICAL,
- "Failed to import ElementTree from any known place")
+ message(CRITICAL, "Failed to import ElementTree")
sys.exit(1)
- if cetree:
- if cetree.VERSION < "1.0":
- message(CRITICAL,
- "cElementTree version is too old, 1.0 and upper required")
- sys.exit(1)
-
- etree = cetree
- else:
- if etree.VERSION < "1.1":
- message(CRITICAL,
- "ElementTree version is too old, 1.1 and upper required")
- sys.exit(1)
-
- return etree
-
-"""ElementTree module
-in extensions use: `from markdown import etree`
-to access to the ElemetTree module, do not import it by yourself"""
-etree = importETree()
-
-RTL_BIDI_RANGES = ( (u'\u0590', u'\u07FF'),
- # from Hebrew to Nko (includes Arabic, Syriac and Thaana)
- (u'\u2D30', u'\u2D7F'),
- # Tifinagh
- )
-
-# Unicode Reference Table:
-# 0590-05FF - Hebrew
-# 0600-06FF - Arabic
-# 0700-074F - Syriac
-# 0750-077F - Arabic Supplement
-# 0780-07BF - Thaana
-# 07C0-07FF - Nko
-
-BOMS = { 'utf-8': (codecs.BOM_UTF8, ),
- 'utf-16': (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE),
- #'utf-32': (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)
- }
-
-def removeBOM(text, encoding):
- """
- Used by `markdownFromFile` to remove a "byte order mark" from the begining
- of an utf-8, utf-16 or utf-32 encoded file.
- """
+ if etree_in_c and etree_in_c.VERSION < "1.0":
+ message(CRITICAL, "For cElementTree version 1.0 or higher is required.")
+ sys.exit(1)
+ elif etree_in_c :
+ return etree_in_c
+ elif etree.VERSION < "1.1":
+ message(CRITICAL, "For ElementTree version 1.1 or higher is required")
+ sys.exit(1)
+ else :
+ return etree
- convert = isinstance(text, unicode)
- for bom in BOMS[encoding]:
- bom = convert and bom.decode(encoding) or bom
- if text.startswith(bom):
- return text.lstrip(bom)
- return text
-
-
-# The following constant specifies the name used in the usage
-# statement displayed for python versions lower than 2.3. (With
-# python2.3 and higher the usage statement is generated by optparse
-# and uses the actual name of the executable called.)
-
-EXECUTABLE_NAME_FOR_USAGE = "python markdown.py"
-
-
-# --------------- CONSTANTS YOU _SHOULD NOT_ HAVE TO CHANGE ----------
-
-
-# placeholders
-STX = u'\u0002' # Use STX ("Start of text") for start-of-placeholder
-ETX = u'\u0003' # Use ETX ("End of text") for end-of-placeholder
-HTML_PLACEHOLDER_PREFIX = STX+"wzxhzdk:"
-HTML_PLACEHOLDER = HTML_PLACEHOLDER_PREFIX + "%d" + ETX
-INLINE_PLACEHOLDER_PREFIX = STX+"klzzwxh:"
-INLINE_PLACEHOLDER = INLINE_PLACEHOLDER_PREFIX + "%s" + ETX
-
-AMP_SUBSTITUTE = STX+"amp"+ETX
-
-BLOCK_LEVEL_ELEMENTS = re.compile('p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del|hr|hr/|style|li|tr')
-
def isBlockLevel(tag):
- """
- Used by HTMLBlockPreprocessor to check if a given tag is a block level
- element.
- """
+ """Check if the tag is a block level HTML tag."""
return BLOCK_LEVEL_ELEMENTS.match(tag)
-
-def codepoint2name(code):
- """
- Return entity definition by code, or code
- if there is no such entity definition
- """
- entity = htmlentitydefs.codepoint2name.get(code)
- if entity:
- return "%s%s;" % (AMP_SUBSTITUTE, entity)
- else:
- return "%s#%d;" % (AMP_SUBSTITUTE, code)
-
def handleAttributes(text, parent):
- """ Handale attributes, e.g {@id=123} """
+ """Set values of an element based on attribute definitions ({@id=123})."""
def attributeCallback(match):
parent.set(match.group(1), match.group(2))
+ return CORE_RE['attr'].sub(attributeCallback, text)
- return RE.regExp['attr'].sub(attributeCallback, text)
-
-
-class AtomicString(unicode):
- "A string which should not be further processed."
- pass
+def dequote(string):
+ """Remove quotes from around a string."""
+ if ( ( string.startswith('"') and string.endswith('"'))
+ or (string.startswith("'") and string.endswith("'")) ):
+ return string[1:-1]
+ else:
+ return string
"""
-======================================================================
-========================== PRE-PROCESSORS ============================
-======================================================================
-
-Preprocessors munge source text before we start doing anything too
-complicated.
-
-There are two types of preprocessors: TextPreprocessor and Preprocessor.
+PRE-PROCESSORS
+=============================================================================
+Preprocessors work on source text before we start doing anything too
+complicated. There are two types of preprocessors: TextPreprocessor and
+Preprocessor.
"""
-
class TextPreprocessor:
"""
TextPreprocessors are run before the text is broken into lines.
@@ -255,27 +213,22 @@ class Preprocessor:
class HtmlBlockPreprocessor(TextPreprocessor):
- """
- Remove html blocks from the source text and store them for later retrieval.
- """
+ """Remove html blocks from the text and store them for later retrieval."""
+
right_tag_patterns = ["</%s>", "%s>"]
def _get_left_tag(self, block):
return block[1:].replace(">", " ", 1).split()[0].lower()
-
- def _get_right_tag(self, left_tag, block):
-
+ def _get_right_tag(self, left_tag, block):
for p in self.right_tag_patterns:
tag = p % left_tag
i = block.rfind(tag)
if i > 2:
return tag.lstrip("<").rstrip(">"), i + len(p)-2 + len(left_tag)
-
return block.rstrip()[-len(left_tag)-2:-1].lower(), len(block)
def _equal_tags(self, left_tag, right_tag):
-
if left_tag == 'div' or left_tag[0] in ['?', '@', '%']: # handle PHP, etc.
return True
if ("/" + left_tag) == right_tag:
@@ -291,17 +244,14 @@ class HtmlBlockPreprocessor(TextPreprocessor):
def _is_oneliner(self, tag):
return (tag in ['hr', 'hr/'])
-
def run(self, text):
- """ Find and remove raw html from text. """
new_blocks = []
- text = text.split("\n\n")
-
+ text = text.split("\n\n")
items = []
left_tag = ''
right_tag = ''
in_tag = False # flag
-
+
while text:
block = text[0]
if block.startswith("\n"):
@@ -312,9 +262,7 @@ class HtmlBlockPreprocessor(TextPreprocessor):
block = block[1:]
if not in_tag:
-
if block.startswith("<"):
-
left_tag = self._get_left_tag(block)
right_tag, data_index = self._get_right_tag(left_tag, block)
@@ -380,14 +328,13 @@ HTML_BLOCK_PREPROCESSOR = HtmlBlockPreprocessor()
class HeaderPreprocessor(Preprocessor):
- """
- Replace underlined headers with hashed headers to avoid
- the need for lookahead later.
+ """Replace underlined headers with hashed headers.
+
+ (To avoid the need for lookahead later.)
+
"""
def run (self, lines):
- """ Find and replace underlined headers. """
-
i = -1
while i+1 < len(lines):
i = i+1
@@ -416,14 +363,10 @@ HEADER_PREPROCESSOR = HeaderPreprocessor()
class LinePreprocessor(Preprocessor):
- """
- Convert HR lines to "___" format
- """
+ """Convert HR lines to "___" format."""
blockquote_re = re.compile(r'^(> )+')
def run (self, lines):
- """ Find and replace HR lines. """
-
for i in range(len(lines)):
prefix = ''
m = self.blockquote_re.search(lines[i])
@@ -441,7 +384,7 @@ class LinePreprocessor(Preprocessor):
if len(text) <= 2:
return False
for pattern in ['isline1', 'isline2', 'isline3']:
- m = RE.regExp[pattern].match(text)
+ m = CORE_RE[pattern].match(text)
if (m and m.group(1)):
return True
else:
@@ -451,16 +394,11 @@ LINE_PREPROCESSOR = LinePreprocessor()
class ReferencePreprocessor(Preprocessor):
- """
- Remove reference definitions from the text and store them for later use.
-
- """
-
+ """Remove reference definitions from the text and store them for later use."""
def run (self, lines):
- """ Remove and store reference defs. """
new_text = [];
for line in lines:
- m = RE.regExp['reference-def'].match(line)
+ m = CORE_RE['reference-def'].match(line)
if m:
id = m.group(2).strip().lower()
t = m.group(4).strip() # potential title
@@ -481,20 +419,21 @@ class ReferencePreprocessor(Preprocessor):
REFERENCE_PREPROCESSOR = ReferencePreprocessor()
+
+
"""
-======================================================================
-========================== INLINE PATTERNS ===========================
-======================================================================
+INLINE PATTERNS
+=============================================================================
Inline patterns such as *emphasis* are handled by means of auxiliary
objects, one per pattern. Pattern objects must be instances of classes
that extend markdown.Pattern. Each pattern object uses a single regular
expression and needs support the following methods:
- pattern.getCompiledRegExp() - returns a regular expression
+ pattern.getCompiledRegExp() # returns a regular expression
- pattern.handleMatch(m) - takes a match object and returns
- a ElementTree element or just plain text
+ pattern.handleMatch(m) # takes a match object and returns
+ # an ElementTree element or just plain text
All of python markdown's built-in patterns subclass from Pattern,
but you can add additional patterns that don't.
@@ -509,20 +448,26 @@ important - e.g. if we first replace http://.../ links with <a> tags
and _then_ try to replace inline html, we would end up with a mess.
So, we apply the expressions in the following order:
- * escape and backticks have to go before everything else, so
- that we can preempt any markdown patterns by escaping them.
+* escape and backticks have to go before everything else, so
+ that we can preempt any markdown patterns by escaping them.
+
+* then we handle auto-links (must be done before inline html)
+
+* then we handle inline HTML. At this point we will simply
+ replace all inline HTML strings with a placeholder and add
+ the actual HTML to a hash.
- * then we handle auto-links (must be done before inline html)
+* then inline images (must be done before links)
- * then we handle inline HTML. At this point we will simply
- replace all inline HTML strings with a placeholder and add
- the actual HTML to a hash.
+* then bracketed links, first regular then reference-style
- * then inline images (must be done before links)
+* finally we apply strong and emphasis
+"""
- * then bracketed links, first regular then reference-style
- * finally we apply strong and emphasis
+"""
+The actual regular expressions for patterns
+-----------------------------------------------------------------------------
"""
NOBRACKET = r'[^\]\[]*'
@@ -558,6 +503,12 @@ ENTITY_RE = r'(&[\#a-zA-Z0-9]*;)' # &amp;
LINE_BREAK_RE = r' \n' # two spaces at end of line
LINE_BREAK_2_RE = r' $' # two spaces at end of text
+
+"""
+The pattern classes
+-----------------------------------------------------------------------------
+"""
+
class Pattern:
"""Base class that inline patterns subclass. """
@@ -581,9 +532,9 @@ class Pattern:
return self.compiled_re
def handleMatch(self, m):
- """
- Return a ElementTree element from the given match. Subclasses should
- override this method.
+ """Return an ElementTree element from the given match.
+
+ Subclasses should override this method.
Keyword arguments:
@@ -639,8 +590,8 @@ class BacktickPattern (Pattern):
class DoubleTagPattern (SimpleTagPattern):
- """
- Return a ElementTree element nested in tag2 nested in tag1.
+ """Return an ElementTree element nested in tag2 nested in tag1.
+
Useful for strong emphasis etc.
"""
@@ -665,7 +616,6 @@ class HtmlPattern (Pattern):
class LinkPattern (Pattern):
""" Return a link element from the given match. """
def handleMatch(self, m):
-
el = etree.Element("a")
el.text = m.group(2)
title = m.group(11)
@@ -796,6 +746,14 @@ class AutomailPattern (Pattern):
if email.startswith("mailto:"):
email = email[len("mailto:"):]
+ def codepoint2name(code):
+ """Return entity definition by code, or the code if not defined."""
+ entity = htmlentitydefs.codepoint2name.get(code)
+ if entity:
+ return "%s%s;" % (AMP_SUBSTITUTE, entity)
+ else:
+ return "%s#%d;" % (AMP_SUBSTITUTE, code)
+
letters = [codepoint2name(ord(letter)) for letter in email]
el.text = AtomicString(''.join(letters))
@@ -831,18 +789,16 @@ AUTOMAIL_PATTERN = AutomailPattern(AUTOMAIL_RE)
"""
-======================================================================
-========================== POST-PROCESSORS ===========================
-======================================================================
+POST-PROCESSORS
+=============================================================================
-Markdown also allows post-processors, which are similar to
-preprocessors in that they need to implement a "run" method. However,
-they are run after core processing.
+Markdown also allows post-processors, which are similar to preprocessors in
+that they need to implement a "run" method. However, they are run after core
+processing.
There are two types of post-processors: Postprocessor and TextPostprocessor
"""
-
class Postprocessor:
"""
Postprocessors are run before the ElementTree serialization.
@@ -863,7 +819,6 @@ class Postprocessor:
pass
-
class TextPostprocessor:
"""
TextPostprocessors are run after the ElementTree it converted back into text.
@@ -884,12 +839,11 @@ class TextPostprocessor:
"""
pass
-class PrettifyPostprocessor(Postprocessor):
- """ Add linebreaks to the html document. """
+class PrettifyPostprocessor(Postprocessor):
+ """Add linebreaks to the html document."""
def _prettifyETree(self, elem):
- """ Recursively add linebreaks to ElementTree children. """
-
+ """Recursively add linebreaks to ElementTree children."""
i = "\n"
if isBlockLevel(elem.tag) and elem.tag not in ['code', 'pre']:
if (not elem.text or not elem.text.strip()) \
@@ -904,8 +858,7 @@ class PrettifyPostprocessor(Postprocessor):
elem.tail = i
def run(self, root):
- """ Add linebreaks to ElementTree root object """
-
+ """Add linebreaks to ElementTree root object."""
self._prettifyETree(root)
# Do <br />'s seperately as they are often in the middle of
# inline content and missed by _prettifyETree.
@@ -965,11 +918,15 @@ AMPSUBSTITUTETEXTPOSTPROCESSOR = AndSubstitutePostprocessor()
"""
-======================================================================
-========================== MISC AUXILIARY CLASSES ====================
-======================================================================
+MISC AUXILIARY CLASSES
+=============================================================================
"""
+class AtomicString(unicode):
+ """A string which should not be further processed."""
+ pass
+
+
class HtmlStash:
"""
This class is used for stashing HTML objects that we extract
@@ -1004,98 +961,6 @@ class HtmlStash:
self.html_counter = 0
self.rawHtmlBlocks = []
-
-class BlockGuru:
- """ Parse document for block level constructs (paragraphs, lists, etc.)."""
-
- def _findHead(self, lines, fn, allowBlank=0):
-
- """
- Functional magic to help determine boundaries of indented
- blocks.
-
- Keyword arguments:
-
- * lines: an array of strings
- * fn: a function that returns a substring of a string
- if the string matches the necessary criteria
- * allowBlank: specifies whether it's ok to have blank
- lines between matching functions
-
- Returns: a list of post processes items and the unused
- remainder of the original list
-
- """
- items = []
- item = -1
-
- i = 0 # to keep track of where we are
-
- for line in lines:
-
- if not line.strip() and not allowBlank:
- return items, lines[i:]
-
- if not line.strip() and allowBlank:
- # If we see a blank line, this _might_ be the end
- i += 1
-
- # Find the next non-blank line
- for j in range(i, len(lines)):
- if lines[j].strip():
- next = lines[j]
- break
- else:
- # There is no more text => this is the end
- break
-
- # Check if the next non-blank line is still a part of the list
-
- part = fn(next)
-
- if part:
- items.append("")
- continue
- else:
- break # found end of the list
-
- part = fn(line)
-
- if part:
- items.append(part)
- i += 1
- continue
- else:
- return items, lines[i:]
- else:
- i += 1
-
- return items, lines[i:]
-
-
- def detabbed_fn(self, line):
- """ An auxiliary method to be passed to _findHead """
- m = RE.regExp['tabbed'].match(line)
- if m:
- return m.group(4)
- else:
- return None
-
-
- def detectTabbed(self, lines):
- """ Find indented text and remove indent before further proccesing. """
- return self._findHead(lines, self.detabbed_fn,
- allowBlank = 1)
-
-
-def dequote(string):
- """ Removes quotes from around a string """
- if ( ( string.startswith('"') and string.endswith('"'))
- or (string.startswith("'") and string.endswith("'")) ):
- return string[1:-1]
- else:
- return string
-
class InlineStash:
@@ -1150,52 +1015,34 @@ class InlineStash:
self._nodes = {}
"""
-======================================================================
-========================== CORE MARKDOWN =============================
-======================================================================
+CORE MARKDOWN
+=============================================================================
-This stuff is hard, so if you are thinking of extending the syntax,
-see first if you can do it via pre-processors, post-processors,
-inline patterns or a combination of the three.
+The core part is still quite messy, despite substantial refactoring. If you
+are thinking of extending the syntax, see first if you can do it through
+pre-processors, post-processors, inline patterns or a combination of the three.
"""
-class CorePatterns:
- """
- This class is scheduled for removal as part of a refactoring effort.
- """
-
- patterns = {
- 'header': r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)', # # A title
- 'reference-def': r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)',
- # [Google]: http://www.google.com/
- 'containsline': r'([-]*)$|^([=]*)', # -----, =====, etc.
- 'ol': r'[ ]{0,3}[\d]*\.\s+(.*)', # 1. text
- 'ul': r'[ ]{0,3}[*+-]\s+(.*)', # "* text"
- 'isline1': r'(\**)', # ***
- 'isline2': r'(\-*)', # ---
- 'isline3': r'(\_*)', # ___
- 'tabbed': r'((\t)|( ))(.*)', # an indented line
- 'quoted': r'[ ]{0,2}> ?(.*)', # a quoted block ("> ...")
- }
-
- def __init__ (self):
-
- self.regExp = {}
- for key in self.patterns.keys():
- self.regExp[key] = re.compile("^%s$" % self.patterns[key],
- re.DOTALL)
-
- self.regExp['containsline'] = re.compile(r'^([-]*)$|^([=]*)$', re.M)
- self.regExp['attr'] = re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
-
-RE = CorePatterns()
+def _wrapRe(raw_re) : return re.compile("^%s$" % raw_re, re.DOTALL)
+CORE_RE = {
+ 'header': _wrapRe(r'(#{1,6})[ \t]*(.*?)[ \t]*(#*)'), # # A title
+ 'reference-def': _wrapRe(r'(\ ?\ ?\ ?)\[([^\]]*)\]:\s*([^ ]*)(.*)'),
+ # [Google]: http://www.google.com/
+ 'containsline': _wrapRe(r'([-]*)$|^([=]*)'), # -----, =====, etc.
+ 'ol': _wrapRe(r'[ ]{0,3}[\d]*\.\s+(.*)'), # 1. text
+ 'ul': _wrapRe(r'[ ]{0,3}[*+-]\s+(.*)'), # "* text"
+ 'isline1': _wrapRe(r'(\**)'), # ***
+ 'isline2': _wrapRe(r'(\-*)'), # ---
+ 'isline3': _wrapRe(r'(\_*)'), # ___
+ 'tabbed': _wrapRe(r'((\t)|( ))(.*)'), # an indented line
+ 'quoted': _wrapRe(r'[ ]{0,2}> ?(.*)'), # a quoted block ("> ...")
+ 'containsline': re.compile(r'^([-]*)$|^([=]*)$', re.M),
+ 'attr': re.compile("\{@([^\}]*)=([^\}]*)}") # {@id=123}
+}
class Markdown:
- """
- Markdown formatter class for creating an html document from Markdown text.
- """
-
+ """Converts markdown to HTML."""
def __init__(self,
extensions=[],
@@ -1216,7 +1063,6 @@ class Markdown:
"""
self.source = None
self.safeMode = safe_mode
- self.blockGuru = BlockGuru()
self.registeredExtensions = []
self.docType = ""
self.stripTopLevelTags = True
@@ -1397,7 +1243,7 @@ class Markdown:
'tabbed': self._processCodeBlock}
for regexp in ['ul', 'ol', 'quoted', 'tabbed']:
- m = RE.regExp[regexp].match(lines[0])
+ m = CORE_RE[regexp].match(lines[0])
if m:
processFn[regexp](parent_elem, lines, inList)
return
@@ -1420,8 +1266,8 @@ class Markdown:
if inList:
start, lines = self._linesUntil(lines, (lambda line:
- RE.regExp['ul'].match(line)
- or RE.regExp['ol'].match(line)
+ CORE_RE['ul'].match(line)
+ or CORE_RE['ol'].match(line)
or not line.strip()))
self._processSection(parent_elem, start,
@@ -1437,7 +1283,7 @@ class Markdown:
self._processHeader(parent_elem, paragraph)
elif len(paragraph) and \
- RE.regExp["isline3"].match(paragraph[0]):
+ CORE_RE["isline3"].match(paragraph[0]):
self._processHR(parent_elem)
lines = paragraph[1:] + lines
@@ -1453,7 +1299,7 @@ class Markdown:
hr = etree.SubElement(parentElem, "hr")
def _processHeader(self, parentElem, paragraph):
- m = RE.regExp['header'].match(paragraph[0])
+ m = CORE_RE['header'].match(paragraph[0])
if m:
level = len(m.group(1))
h = etree.SubElement(parentElem, "h%d" % level)
@@ -1480,7 +1326,7 @@ class Markdown:
# Searching for hr or header
for line in paragraph:
# it's hr
- if RE.regExp["isline3"].match(line):
+ if CORE_RE["isline3"].match(line):
el.text = "\n".join(dump)
self._processHR(el)
dump = []
@@ -1548,8 +1394,8 @@ class Markdown:
# Check if the next non-blank line is still a part of the list
- if ( RE.regExp[listexpr].match(next) or
- RE.regExp['tabbed'].match(next) ):
+ if ( CORE_RE[listexpr].match(next) or
+ CORE_RE['tabbed'].match(next) ):
# get rid of any white space in the line
items[item].append(line.strip())
looseList = loose or looseList
@@ -1562,7 +1408,7 @@ class Markdown:
for expr in ['ul', 'ol', 'tabbed']:
- m = RE.regExp[expr].match(line)
+ m = CORE_RE[expr].match(line)
if m:
if expr in ['ul', 'ol']: # We are looking at a new item
#if m.group(1) :
@@ -1628,7 +1474,7 @@ class Markdown:
i = 0
blank_line = False # allow one blank line between paragraphs
for line in lines:
- m = RE.regExp['quoted'].match(line)
+ m = CORE_RE['quoted'].match(line)
if m:
dequoted.append(m.group(1))
i += 1
@@ -1667,7 +1513,7 @@ class Markdown:
Returns: None
"""
- detabbed, theRest = self.blockGuru.detectTabbed(lines)
+ detabbed, theRest = self.detectTabbed(lines)
pre = etree.SubElement(parentElem, "pre")
code = etree.SubElement(pre, "code")
@@ -1675,6 +1521,59 @@ class Markdown:
text = "\n".join(detabbed).rstrip()+"\n"
code.text = AtomicString(text)
self._processSection(parentElem, theRest, inList)
+
+ def detectTabbed(self, lines):
+ """ Find indented text and remove indent before further processing.
+
+ Keyword arguments:
+
+ * lines: an array of strings
+ * fn: a function that returns a substring of a string
+ if the string matches the necessary criteria
+
+ Returns: a list of post processes items and the unused
+ remainder of the original list
+
+ """
+ items = []
+ item = -1
+ i = 0 # to keep track of where we are
+
+ def detab(line):
+ match = CORE_RE['tabbed'].match(line)
+ if match:
+ return match.group(4)
+
+ for line in lines:
+ if line.strip(): # Non-blank line
+ line = detab(line)
+ if line:
+ items.append(line)
+ i += 1
+ continue
+ else:
+ return items, lines[i:]
+
+ else: # Blank line: _maybe_ we are done.
+ i += 1 # advance
+
+ # Find the next non-blank line
+ for j in range(i, len(lines)):
+ if lines[j].strip():
+ next_line = lines[j]; break
+ else:
+ break # There is no more text; we are done.
+
+ # Check if the next non-blank line is tabbed
+ if detab(next_line): # Yes, more work to do.
+ items.append("")
+ continue
+ else:
+ break # No, we are done.
+ else:
+ i += 1
+
+ return items, lines[i:]
def _handleInline(self, data, patternIndex=0):
"""
@@ -1730,11 +1629,11 @@ class Markdown:
if node is None:
return data, True, len(leftData) + match.span(len(match.groups()))[0]
- if not isstr(node):
+ if not isString(node):
if not isinstance(node.text, AtomicString):
# We need to process current node too
for child in [node] + node.getchildren():
- if not isstr(node):
+ if not isString(node):
if child.text:
child.text = self._handleInline(child.text,
patternIndex + 1)
@@ -1824,14 +1723,14 @@ class Markdown:
text = data[strartIndex:index]
linkText(text)
- if not isstr(node): # it's Element
+ if not isString(node): # it's Element
for child in [node] + node.getchildren():
if child.tail:
if child.tail.strip():
self._processElementText(node, child, False)
-
+
if child.text:
if child.text.strip():
self._processElementText(child, child)
@@ -1872,35 +1771,26 @@ class Markdown:
Returns: ElementTree object with applied inline patterns.
"""
- el = markdownTree.getroot()
-
- stack = [el]
+ stack = [markdownTree.getroot()]
while stack:
currElement = stack.pop()
insertQueue = []
for child in currElement.getchildren():
-
- if not isinstance(child.text, AtomicString) and child.text:
-
+ if child.text and not isinstance(child.text, AtomicString):
text = child.text
child.text = None
lst = self._processPlaceholders(self._handleInline(
text), child)
stack += lst
-
-
insertQueue.append((child, lst))
-
if child.getchildren():
stack.append(child)
-
for element, lst in insertQueue:
if element.text:
- element.text = handleAttributes(element.text,
- element)
+ element.text = handleAttributes(element.text, element)
i = 0
for newChild in lst:
# Processing attributes
@@ -1913,13 +1803,10 @@ class Markdown:
element.insert(i, newChild)
i += 1
-
return markdownTree
-
def markdownToTree(self, source=None):
- """
- Create ElementTree, without applying inline paterns.
+ """Create ElementTree, without applying inline patterns.
Keyword arguments:
@@ -1934,10 +1821,8 @@ class Markdown:
return u""
# Fixup the source text
-
self.source = self.source.replace(STX, "")
self.source = self.source.replace(ETX, "")
-
self.source = self.source.replace("\r\n", "\n").replace("\r", "\n")
self.source += "\n\n"
self.source = self.source.expandtabs(TAB_LENGTH)
@@ -1950,8 +1835,7 @@ class Markdown:
return markdownTree
def convert (self, source):
- """
- Create the document in XHTML format.
+ """Create the document in XHTML format.
Keyword arguments:
@@ -1986,10 +1870,8 @@ class Markdown:
return xml.strip()
-
def __str__(self):
- ''' Report info about instance. Markdown always returns unicode. '''
-
+ """ Report info about instance. Markdown always returns unicode."""
if self.source is None:
status = 'in which no source text has been assinged.'
else:
@@ -2000,26 +1882,29 @@ class Markdown:
__unicode__ = convert # markdown should always return a unicode string
+"""
+EXPORTED FUNCTIONS
+=============================================================================
-
-
-# ====================================================================
+Those are the two functions we really mean to export: markdown() and
+markdownFromFile().
+"""
def markdownFromFile(input = None,
output = None,
extensions = [],
encoding = None,
safe = False):
- """
- Convenience wrapper function that takes a filename as input.
+ """Converts a markdown file and returns the HTML as a unicode string.
Used from the command-line, although may be useful in other situations.
Decodes the file using the provided encoding (defaults to utf-8), passes
the file content to markdown, and outputs the html to either the provided
filename or stdout in the same encoding as the source file.
- **Note:** This is the only place that decoding and encoding takes place
- in Python-Markdown.
+ **Note:** This is the only place that decoding and encoding of unicode
+ takes place in Python-Markdown. (All other code is unicode-in /
+ unicode-out.)
Keyword arguments:
@@ -2029,30 +1914,26 @@ def markdownFromFile(input = None,
* encoding: Encoding of input and output files. Defaults to utf-8.
* safe_mode: Disallow raw html. One of "remove", "replace" or "escape".
- Returns: An HTML document as a string.
-
"""
+
+ encoding = encoding or "utf-8"
- message(DEBUG, "input file: %s" % input)
-
- if not encoding:
- encoding = "utf-8"
-
+ # Read the source
input_file = codecs.open(input, mode="r", encoding=encoding)
text = input_file.read()
input_file.close()
+ text = text.lstrip(u'\ufeff') # remove the byte-order mark
- text = removeBOM(text, encoding)
-
- new_text = markdown(text, extensions, safe_mode = safe)
+ # Convert
+ html = markdown(text, extensions, safe_mode = safe)
+ # Write to file or stdout
if output:
output_file = codecs.open(output, "w", encoding=encoding)
- output_file.write(new_text)
+ output_file.write(html)
output_file.close()
-
else:
- sys.stdout.write(new_text.encode(encoding))
+ sys.stdout.write(html.encode(encoding))
def markdown(text,
extensions = [],
@@ -2082,11 +1963,15 @@ def markdown(text,
return md.convert(text)
+"""
+Extensions
+-----------------------------------------------------------------------------
+"""
+
class Extension:
""" Base class for extensions to subclass. """
def __init__(self, configs = {}):
- """
- Create an instance of an Extention.
+ """Create an instance of an Extension.
Keyword arguments:
@@ -2169,9 +2054,19 @@ def load_extension(ext_name, configs = []):
return module.makeExtension(configs.items())
-#############################################################################
-## Only command-line specific stuff from here down.
-#############################################################################
+# Extensions should use "markdown.etree" instead of "etree" (or do `from
+# markdown import etree`). Do not import it by yourself.
+
+etree = importETree()
+
+
+"""
+COMMAND-LINE SPECIFIC STUFF
+=============================================================================
+
+The rest of the code is specifically for handling the case where Python
+Markdown is called from the command line.
+"""
OPTPARSE_WARNING = """
Python 2.3 or higher required for advanced command line options.