aboutsummaryrefslogtreecommitdiffstats
path: root/scripts/odt2txt.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/odt2txt.py')
-rw-r--r--scripts/odt2txt.py465
1 files changed, 0 insertions, 465 deletions
diff --git a/scripts/odt2txt.py b/scripts/odt2txt.py
deleted file mode 100644
index bb8fab0..0000000
--- a/scripts/odt2txt.py
+++ /dev/null
@@ -1,465 +0,0 @@
-"""
-ODT2TXT
-=======
-
-ODT2TXT converts files in Open Document Text format (ODT) into
-Markdown-formatted plain text.
-
-Written by [Yuri Takhteyev](http://www.freewisdom.org).
-
-Project website: http://www.freewisdom.org/projects/python-markdown/odt2txt.php
-Contact: yuri [at] freewisdom.org
-
-License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
-
-Version: 0.1 (April 7, 2006)
-
-"""
-
-
-
-import sys, zipfile, xml.dom.minidom
-
-IGNORED_TAGS = ["office:annotation"]  # element tags that textToString skips without emitting any text
-
-FOOTNOTE_STYLES = ["Footnote"]  # NOTE(review): not referenced by any code visible in this file
-
-
class TextProps:
    """Holds properties for a text style.

    Attributes:
        italic: True once setItalic() has been called with "italic".
        bold: True once setBold() has been called with "bold".
        fixed: Whatever was last passed to setFixed(); starts as False.
    """

    def __init__(self):
        self.italic = False
        self.bold = False
        self.fixed = False

    def setItalic(self, value):
        """Switch italic on when *value* is "italic".

        Any other value leaves the flag as it was; it is never switched
        back off.
        """
        if value == "italic":
            self.italic = True

    def setBold(self, value):
        """Switch bold on when *value* is "bold".

        Any other value leaves the flag as it was; it is never switched
        back off.
        """
        if value == "bold":
            self.bold = True

    def setFixed(self, value):
        """Store *value* as the ``fixed`` flag."""
        self.fixed = value

    def __str__(self):
        """Return a short debug summary of the three flags."""
        # The bold flag used to be printed under the garbled label "h=i".
        return "[i=%s, b=%s, fixed=%s]" % (self.italic, self.bold, self.fixed)
-
class ParagraphProps:
    """Plain record of the properties tracked for a paragraph style."""

    def __init__(self):
        # Every flag starts "off"; the setters below overwrite them verbatim.
        self.blockquote = False
        self.headingLevel = 0
        self.code = False
        self.title = False
        self.indented = 0

    def setIndented(self, value):
        """Store *value* as the ``indented`` flag."""
        self.indented = value

    def setHeading(self, level):
        """Store *level* as the heading level."""
        self.headingLevel = level

    def setTitle(self, value):
        """Store *value* as the ``title`` flag."""
        self.title = value

    def setCode(self, value):
        """Store *value* as the ``code`` flag."""
        self.code = value

    def __str__(self):
        """Return a debug summary of blockquote, heading level and code."""
        summary = (str(self.blockquote), self.headingLevel, str(self.code))
        return "[bq=%s, h=%d, code=%s]" % summary
-
-
class ListProperties:
    """Plain record for a list style; it tracks one flag, ``ordered``."""

    def __init__(self):
        # A freshly created list style is treated as unordered.
        self.ordered = False

    def setOrdered(self, value):
        """Store *value* as the ``ordered`` flag."""
        self.ordered = value
-
-
-
class OpenDocumentTextFile:
    """An ODT document that can be rendered as Markdown-style plain text.

    The constructor loads ``styles.xml`` and ``content.xml`` from the
    archive and records the style information needed for rendering;
    toString() then walks the document body.

    Attributes:
        footnotes: (citation, text) pairs collected while rendering.
        footnoteCounter: initialised to 0; not used by the methods here.
        textStyles: style name -> TextProps.
        paragraphStyles: style name -> ParagraphProps.
        listStyles: style name -> ListProperties.
        fixedFonts: names of fonts declared with a fixed pitch.
        hasTitle: set to 1 once a title paragraph has been rendered.
        content: the parsed ``content.xml`` document (set by load()).
    """

    # Markdown only recognises a code block when every line is indented
    # by at least four spaces (or one tab).
    CODE_INDENT = "    "

    def __init__(self, filepath):
        """Load the ODT file at *filepath*."""
        self.footnotes = []
        self.footnoteCounter = 0
        self.textStyles = {"Standard": TextProps()}
        self.paragraphStyles = {"Standard": ParagraphProps()}
        self.listStyles = {}
        self.fixedFonts = []
        self.hasTitle = 0

        self.load(filepath)

    def processFontDeclarations(self, fontDecl):
        """Record the names of all fixed-pitch fonts declared in *fontDecl*.

        *fontDecl* is a font-declaration element; each contained
        ``style:font-face`` whose ``style:font-pitch`` is "fixed" has its
        ``style:name`` appended to ``self.fixedFonts``.
        """
        for fontFace in fontDecl.getElementsByTagName("style:font-face"):
            if fontFace.getAttribute("style:font-pitch") == "fixed":
                self.fixedFonts.append(fontFace.getAttribute("style:name"))

    def extractTextProperties(self, style, parent=None):
        """Build a TextProps from a style element.

        Args:
            style: the style element to inspect.
            parent: optional name of the parent text style.  When that
                style is already known, its flags are the starting point.

        Returns:
            A new TextProps instance.
        """
        textProps = TextProps()

        # Inherit by copying the flags rather than reusing the parent
        # object, so the setters below cannot modify the parent style.
        parentProps = self.textStyles.get(parent) if parent else None
        if parentProps is not None:
            textProps.italic = parentProps.italic
            textProps.bold = parentProps.bold
            textProps.fixed = parentProps.fixed

        textPropEls = style.getElementsByTagName("style:text-properties")
        if not textPropEls:
            return textProps
        textPropEl = textPropEls[0]

        textProps.setItalic(textPropEl.getAttribute("fo:font-style"))
        textProps.setBold(textPropEl.getAttribute("fo:font-weight"))

        if textPropEl.getAttribute("style:font-name") in self.fixedFonts:
            textProps.setFixed(True)

        return textProps

    def extractParagraphProperties(self, style, parent=None):
        """Build a ParagraphProps from a style element.

        Args:
            style: the style element to inspect.
            parent: accepted for symmetry with extractTextProperties();
                it is not consulted here.

        Returns:
            A new ParagraphProps with heading level, title, indentation
            and code flags derived from the style's name and properties.
        """
        paraProps = ParagraphProps()
        name = style.getAttribute("style:name")

        headingPrefix = "Heading_20_"
        if name.startswith(headingPrefix):
            try:
                paraProps.setHeading(int(name[len(headingPrefix):]))
            except ValueError:
                pass  # suffix is not a number: leave the level at 0

        if name == "Title":
            paraProps.setTitle(True)

        paraPropEls = style.getElementsByTagName("style:paragraph-properties")
        if paraPropEls:
            leftMargin = paraPropEls[0].getAttribute("fo:margin-left")
            if leftMargin:
                try:
                    # Drop the trailing two characters before parsing;
                    # assumes a two-letter unit suffix such as "in" or "cm".
                    if float(leftMargin[:-2]) > 0.01:
                        paraProps.setIndented(True)
                except ValueError:
                    pass  # unparsable margin: treat as not indented

        # A paragraph set in a fixed-pitch font is rendered as code.
        if self.extractTextProperties(style).fixed:
            paraProps.setCode(True)

        return paraProps

    def processStyles(self, styleElements):
        """Record text and paragraph styles from "style" elements.

        The built-in "Standard" entries are never overwritten; elements of
        any family other than "text" or "paragraph" are ignored.
        """
        for style in styleElements:
            name = style.getAttribute("style:name")
            if name == "Standard":
                continue

            family = style.getAttribute("style:family")
            parent = style.getAttribute("style:parent-style-name")

            if family == "text":
                self.textStyles[name] = self.extractTextProperties(
                    style, parent)
            elif family == "paragraph":
                self.paragraphStyles[name] = (
                    self.extractParagraphProperties(style, parent))

    def processListStyles(self, listStyleElements):
        """Record, per list style, whether the list is ordered.

        A style counts as ordered when its first child element is a
        ``text:list-level-style-number``.
        """
        for style in listStyleElements:
            prop = ListProperties()
            firstLevel = self._firstElementChild(style)
            if (firstLevel is not None
                    and firstLevel.tagName == "text:list-level-style-number"):
                prop.setOrdered(True)
            self.listStyles[style.getAttribute("style:name")] = prop

    def load(self, filepath):
        """Load an ODT file: read its styles and keep its parsed content.

        Raises:
            KeyError: if the archive has no ``styles.xml`` or
                ``content.xml`` member (raised by ZipFile.read).
        """
        archive = zipfile.ZipFile(filepath)
        try:
            stylesXml = archive.read("styles.xml")
            contentXml = archive.read("content.xml")
        finally:
            # Release the file handle even when a member is missing.
            archive.close()

        self._loadStyles(xml.dom.minidom.parseString(stylesXml))

        self.content = xml.dom.minidom.parseString(contentXml)
        self._loadStyles(self.content)

    def _loadStyles(self, document):
        """Process font declarations, styles and list styles of *document*."""
        # Iterating (instead of indexing [0]) tolerates a document that
        # has no font declaration section at all.
        for fontDecl in document.getElementsByTagName(
                "office:font-face-decls"):
            self.processFontDeclarations(fontDecl)
        self.processStyles(document.getElementsByTagName("style:style"))
        self.processListStyles(
            document.getElementsByTagName("text:list-style"))

    def _firstElementChild(self, node):
        """Return the first child of *node* that is an element, or None."""
        for child in node.childNodes:
            if child.nodeType == xml.dom.Node.ELEMENT_NODE:
                return child
        return None

    def compressCodeBlocks(self, text):
        """Remove blank lines that sit between two code-block lines.

        A blank line is dropped only when both its neighbours start with
        CODE_INDENT.  The result always begins with a newline, which is
        kept so the overall output format does not change.
        """
        lines = text.split("\n")
        lastIndex = len(lines) - 1
        kept = []
        for i, line in enumerate(lines):
            insideCode = (0 < i < lastIndex
                          and lines[i - 1].startswith(self.CODE_INDENT)
                          and lines[i + 1].startswith(self.CODE_INDENT))
            if line.strip() or not insideCode:
                kept.append(line)

        return "\n" + "\n".join(kept)

    def listToString(self, listElement):
        """Render a list element as Markdown list items.

        Only the first child element of each item is rendered.  Child
        nodes that are not elements (e.g. whitespace) and items without
        any element child are skipped.
        """
        styleName = listElement.getAttribute("text:style-name")
        props = self.listStyles.get(styleName)
        ordered = props is not None and props.ordered

        pieces = []
        number = 0
        for item in listElement.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            firstChild = self._firstElementChild(item)
            if firstChild is None:
                continue

            number += 1
            # The marker always ends in a space: "10.text" would not be
            # a valid list item.
            marker = ("%d. " % number) if ordered else "* "
            pieces.append(marker)
            # Continuation lines are aligned under the item text.
            pieces.append(self.paragraphToString(firstChild,
                                                 indent=len(marker)))
            pieces.append("\n\n")

        return "".join(pieces)

    def toString(self):
        """Convert the document to a string.

        Renders the paragraphs, headings and lists that are direct
        children of ``office:text``, followed by any footnotes collected
        on the way.
        """
        textElement = self.content.getElementsByTagName("office:text")[0]

        # Start from a clean slate so that calling toString() twice does
        # not emit every footnote twice.
        self.footnotes = []

        chunks = []
        for element in textElement.childNodes:
            if element.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if element.tagName == "text:list":
                rendered = self.listToString(element)
            elif element.tagName in ("text:p", "text:h"):
                rendered = self.paragraphToString(element)
            else:
                continue
            if rendered:
                chunks.append(rendered + "\n\n")

        if self.footnotes:
            chunks.append("--------\n\n")
            for cite, body in self.footnotes:
                chunks.append("[^%s]: %s\n\n" % (cite, body))

        return self.compressCodeBlocks(u"".join(chunks))

    def textToString(self, element):
        """Render the inline content of *element* as Markdown text.

        Footnotes met on the way are appended to ``self.footnotes``.
        """
        parts = []
        for node in element.childNodes:
            if node.nodeType == xml.dom.Node.TEXT_NODE:
                parts.append(node.nodeValue)
            elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
                parts.append(self._inlineElementToString(node))
        return u"".join(parts)

    def _inlineElementToString(self, node):
        """Render one inline child element met by textToString()."""
        tag = node.tagName

        if tag == "text:span":
            return self._spanToString(node)

        if tag == "text:note":
            cite = (node.getElementsByTagName("text:note-citation")[0]
                    .childNodes[0].nodeValue)
            body = (node.getElementsByTagName("text:note-body")[0]
                    .childNodes[0])
            self.footnotes.append((cite, self.textToString(body)))
            return "[^%s]" % cite

        if tag in IGNORED_TAGS:
            return ""

        if tag == "text:s":
            # "text:c" holds the repeat count; without a usable count the
            # element stands for a single space.
            try:
                return " " * int(node.getAttribute("text:c"))
            except ValueError:
                return " "

        if tag == "text:tab":
            return " "

        if tag == "text:a":
            return "[%s](%s)" % (self.textToString(node),
                                 node.getAttribute("xlink:href"))

        # Unknown element: leave a visible marker in the output.
        return " {" + tag + "} "

    def _spanToString(self, node):
        """Render a ``text:span`` with the markup of its text style."""
        text = self.textToString(node)

        if not text.strip():
            # Don't apply styles to white space, but keep the white space
            # itself instead of discarding the surrounding text.
            return text

        styleName = node.getAttribute("text:style-name")
        style = self.textStyles.get(styleName)

        if style is None:
            # Unknown style: surround the text with a visible marker.
            mark = "<" + styleName + ">"
        elif style.fixed:
            return "`" + text + "`"
        elif style.italic and style.bold:
            mark = "***"
        elif style.italic:
            mark = "_"
        elif style.bold:
            mark = "**"
        else:
            mark = ""

        return "%s%s%s" % (mark, text, mark)

    def paragraphToString(self, paragraph, indent=0):
        """Render a paragraph or heading element.

        Args:
            paragraph: the element to render.
            indent: number of spaces put before continuation lines of a
                wrapped paragraph.

        Returns:
            The rendered text: a title or heading, a code block, a
            blockquote or a plain wrapped paragraph, depending on the
            paragraph's style.
        """
        styleName = paragraph.getAttribute("text:style-name")
        paraProps = self.paragraphStyles.get(styleName)
        text = self.textToString(paragraph)

        if paraProps is None:
            # Unknown style: render as an ordinary paragraph.
            return self.wrapParagraph(text.strip(), indent=indent)

        if not paraProps.code:
            text = text.strip()

        if paraProps.title:
            self.hasTitle = 1
            return text + "\n" + ("=" * len(text))

        if paraProps.headingLevel:
            level = paraProps.headingLevel
            # The title takes the top level, so headings move down one.
            if self.hasTitle:
                level += 1

            if level == 1:
                return text + "\n" + ("=" * len(text))
            if level == 2:
                return text + "\n" + ("-" * len(text))
            return "#" * level + " " + text

        if paraProps.code:
            return "\n".join(self.CODE_INDENT + line
                             for line in text.split("\n"))

        return self.wrapParagraph(text, indent=indent,
                                  blockquote=bool(paraProps.indented))

    def wrapParagraph(self, text, indent=0, blockquote=False):
        """Re-flow *text* into short lines.

        A new line is started once the words on the current line total
        more than ``50 - indent`` characters (spaces not counted).  Every
        word is followed by one space, so the result ends in a space.
        With *blockquote*, each line starts with "> ".
        """
        LIMIT = 50
        quote = "> " if blockquote else ""

        pieces = [quote]
        counter = 0
        for token in text.split():
            if counter > LIMIT - indent:
                pieces.append("\n" + " " * indent + quote)
                counter = 0
            pieces.append(token + " ")
            counter += len(token)

        return "".join(pieces)
-
-
-
if __name__ == "__main__":

    # Usage: odt2txt.py <file.odt>
    # The converted text is written to standard output, encoded as UTF-8.
    if len(sys.argv) < 2:
        sys.stderr.write("Usage: %s <file.odt>\n" % sys.argv[0])
        sys.exit(1)

    odt = OpenDocumentTextFile(sys.argv[1])
    out_utf8 = odt.toString().encode("utf-8")

    # On Python 3 the text stream sys.stdout rejects bytes, so write to
    # its underlying binary buffer.  Where there is no "buffer" attribute
    # (Python 2), sys.stdout accepts the encoded string directly.
    getattr(sys.stdout, "buffer", sys.stdout).write(out_utf8)