aboutsummaryrefslogtreecommitdiffstats
path: root/odt2txt.py
diff options
context:
space:
mode:
Diffstat (limited to 'odt2txt.py')
-rw-r--r--odt2txt.py465
1 files changed, 465 insertions, 0 deletions
diff --git a/odt2txt.py b/odt2txt.py
new file mode 100644
index 0000000..bb8fab0
--- /dev/null
+++ b/odt2txt.py
@@ -0,0 +1,465 @@
+"""
+ODT2TXT
+=======
+
+ODT2TXT convers files in Open Document Text format (ODT) into
+Markdown-formatted plain text.
+
+Writteby by [Yuri Takhteyev](http://www.freewisdom.org).
+
+Project website: http://www.freewisdom.org/projects/python-markdown/odt2txt.php
+Contact: yuri [at] freewisdom.org
+
+License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD
+
+Version: 0.1 (April 7, 2006)
+
+"""
+
+
+
+import sys, zipfile, xml.dom.minidom
+
+IGNORED_TAGS = ["office:annotation"]
+
+FOOTNOTE_STYLES = ["Footnote"]
+
+
+class TextProps :
+ """ Holds properties for a text style. """
+
+ def __init__ (self):
+
+ self.italic = False
+ self.bold = False
+ self.fixed = False
+
+ def setItalic (self, value) :
+ if value == "italic" :
+ self.italic = True
+
+ def setBold (self, value) :
+ if value == "bold" :
+ self.bold = True
+
+ def setFixed (self, value) :
+ self.fixed = value
+
+ def __str__ (self) :
+
+ return "[i=%s, h=i%s, fixed=%s]" % (str(self.italic),
+ str(self.bold),
+ str(self.fixed))
+
+class ParagraphProps :
+ """ Holds properties of a paragraph style. """
+
+ def __init__ (self):
+
+ self.blockquote = False
+ self.headingLevel = 0
+ self.code = False
+ self.title = False
+ self.indented = 0
+
+ def setIndented (self, value) :
+ self.indented = value
+
+ def setHeading (self, level) :
+ self.headingLevel = level
+
+ def setTitle (self, value):
+ self.title = value
+
+ def setCode (self, value) :
+ self.code = value
+
+
+ def __str__ (self) :
+
+ return "[bq=%s, h=%d, code=%s]" % (str(self.blockquote),
+ self.headingLevel,
+ str(self.code))
+
+
+class ListProperties :
+ """ Holds properties for a list style. """
+
+ def __init__ (self):
+ self.ordered = False
+
+ def setOrdered (self, value) :
+ self.ordered = value
+
+
+
+class OpenDocumentTextFile :
+
+
+ def __init__ (self, filepath) :
+ self.footnotes = []
+ self.footnoteCounter = 0
+ self.textStyles = {"Standard" : TextProps()}
+ self.paragraphStyles = {"Standard" : ParagraphProps()}
+ self.listStyles = {}
+ self.fixedFonts = []
+ self.hasTitle = 0
+
+ self.load(filepath)
+
+
+ def processFontDeclarations (self, fontDecl) :
+ """ Extracts necessary font information from a font-declaration
+ element.
+ """
+ for fontFace in fontDecl.getElementsByTagName("style:font-face") :
+ if fontFace.getAttribute("style:font-pitch") == "fixed" :
+ self.fixedFonts.append(fontFace.getAttribute("style:name"))
+
+
+
+ def extractTextProperties (self, style, parent=None) :
+ """ Extracts text properties from a style element. """
+
+ textProps = TextProps()
+
+ if parent :
+ parentProp = self.textStyles.get(parent, None)
+ if parentProp :
+ textProp = parentProp
+
+ textPropEl = style.getElementsByTagName("style:text-properties")
+ if not textPropEl : return textProps
+
+ textPropEl = textPropEl[0]
+
+ italic = textPropEl.getAttribute("fo:font-style")
+ bold = textPropEl.getAttribute("fo:font-weight")
+
+ textProps.setItalic(italic)
+ textProps.setBold(bold)
+
+ if textPropEl.getAttribute("style:font-name") in self.fixedFonts :
+ textProps.setFixed(True)
+
+ return textProps
+
+ def extractParagraphProperties (self, style, parent=None) :
+ """ Extracts paragraph properties from a style element. """
+
+ paraProps = ParagraphProps()
+
+ name = style.getAttribute("style:name")
+
+ if name.startswith("Heading_20_") :
+ level = name[11:]
+ try :
+ level = int(level)
+ paraProps.setHeading(level)
+ except :
+ level = 0
+
+ if name == "Title" :
+ paraProps.setTitle(True)
+
+ paraPropEl = style.getElementsByTagName("style:paragraph-properties")
+ if paraPropEl :
+ paraPropEl = paraPropEl[0]
+ leftMargin = paraPropEl.getAttribute("fo:margin-left")
+ if leftMargin :
+ try :
+ leftMargin = float(leftMargin[:-2])
+ if leftMargin > 0.01 :
+ paraProps.setIndented(True)
+ except :
+ pass
+
+ textProps = self.extractTextProperties(style)
+ if textProps.fixed :
+ paraProps.setCode(True)
+
+ return paraProps
+
+
+ def processStyles(self, styleElements) :
+ """ Runs through "style" elements extracting necessary information.
+ """
+
+ for style in styleElements :
+
+ name = style.getAttribute("style:name")
+
+ if name == "Standard" : continue
+
+ family = style.getAttribute("style:family")
+ parent = style.getAttribute("style:parent-style-name")
+
+ if family == "text" :
+ self.textStyles[name] = self.extractTextProperties(style,
+ parent)
+
+ elif family == "paragraph":
+ self.paragraphStyles[name] = (
+ self.extractParagraphProperties(style,
+ parent))
+ def processListStyles (self, listStyleElements) :
+
+ for style in listStyleElements :
+ name = style.getAttribute("style:name")
+
+ prop = ListProperties()
+ if style.childNodes :
+ if ( style.childNodes[0].tagName
+ == "text:list-level-style-number" ) :
+ prop.setOrdered(True)
+
+ self.listStyles[name] = prop
+
+
+ def load(self, filepath) :
+ """ Loads an ODT file. """
+
+ zip = zipfile.ZipFile(filepath)
+
+ styles_doc = xml.dom.minidom.parseString(zip.read("styles.xml"))
+ self.processFontDeclarations(styles_doc.getElementsByTagName(
+ "office:font-face-decls")[0])
+ self.processStyles(styles_doc.getElementsByTagName("style:style"))
+ self.processListStyles(styles_doc.getElementsByTagName(
+ "text:list-style"))
+
+ self.content = xml.dom.minidom.parseString(zip.read("content.xml"))
+ self.processFontDeclarations(self.content.getElementsByTagName(
+ "office:font-face-decls")[0])
+ self.processStyles(self.content.getElementsByTagName("style:style"))
+ self.processListStyles(self.content.getElementsByTagName(
+ "text:list-style"))
+
+ def compressCodeBlocks(self, text) :
+ """ Removes extra blank lines from code blocks. """
+
+ lines = text.split("\n")
+ buffer = ""
+ numLines = len(lines)
+ for i in range(numLines) :
+
+ if (lines[i].strip() or i == numLines-1 or i == 0 or
+ not ( lines[i-1].startswith(" ")
+ and lines[i+1].startswith(" ") ) ):
+ buffer += "\n" + lines[i]
+
+ return buffer
+
+
+
+ def listToString (self, listElement) :
+
+ buffer = ""
+
+ styleName = listElement.getAttribute("text:style-name")
+ props = self.listStyles.get(styleName, ListProperties())
+
+
+
+ i = 0
+ for item in listElement.childNodes :
+ i += 1
+ if props.ordered :
+ number = str(i)
+ number = number + "." + " "*(2-len(number))
+ buffer += number + self.paragraphToString(item.childNodes[0],
+ indent=3)
+ else :
+ buffer += "* " + self.paragraphToString(item.childNodes[0],
+ indent=2)
+ buffer += "\n\n"
+
+ return buffer
+
+ def toString (self) :
+ """ Converts the document to a string. """
+ body = self.content.getElementsByTagName("office:body")[0]
+ text = self.content.getElementsByTagName("office:text")[0]
+
+ buffer = u""
+
+
+ paragraphs = [el for el in text.childNodes
+ if el.tagName in ["text:p", "text:h",
+ "text:list"]]
+
+ for paragraph in paragraphs :
+ if paragraph.tagName == "text:list" :
+ text = self.listToString(paragraph)
+ else :
+ text = self.paragraphToString(paragraph)
+ if text :
+ buffer += text + "\n\n"
+
+ if self.footnotes :
+
+ buffer += "--------\n\n"
+ for cite, body in self.footnotes :
+ buffer += "[^%s]: %s\n\n" % (cite, body)
+
+
+ return self.compressCodeBlocks(buffer)
+
+
+ def textToString(self, element) :
+
+ buffer = u""
+
+ for node in element.childNodes :
+
+ if node.nodeType == xml.dom.Node.TEXT_NODE :
+ buffer += node.nodeValue
+
+ elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
+ tag = node.tagName
+
+ if tag == "text:span" :
+
+ text = self.textToString(node)
+
+ if not text.strip() :
+ return "" # don't apply styles to white space
+
+ styleName = node.getAttribute("text:style-name")
+ style = self.textStyles.get(styleName, None)
+
+ #print styleName, str(style)
+
+ if style.fixed :
+ buffer += "`" + text + "`"
+ continue
+
+ if style :
+ if style.italic and style.bold :
+ mark = "***"
+ elif style.italic :
+ mark = "_"
+ elif style.bold :
+ mark = "**"
+ else :
+ mark = ""
+ else :
+ mark = "<" + styleName + ">"
+
+ buffer += "%s%s%s" % (mark, text, mark)
+
+ elif tag == "text:note" :
+ cite = (node.getElementsByTagName("text:note-citation")[0]
+ .childNodes[0].nodeValue)
+
+ body = (node.getElementsByTagName("text:note-body")[0]
+ .childNodes[0])
+
+ self.footnotes.append((cite, self.textToString(body)))
+
+ buffer += "[^%s]" % cite
+
+ elif tag in IGNORED_TAGS :
+ pass
+
+ elif tag == "text:s" :
+ try :
+ num = int(node.getAttribute("text:c"))
+ buffer += " "*num
+ except :
+ buffer += " "
+
+ elif tag == "text:tab" :
+ buffer += " "
+
+
+ elif tag == "text:a" :
+
+ text = self.textToString(node)
+ link = node.getAttribute("xlink:href")
+ buffer += "[%s](%s)" % (text, link)
+
+ else :
+ buffer += " {" + tag + "} "
+
+ return buffer
+
+ def paragraphToString(self, paragraph, indent = 0) :
+
+
+ style_name = paragraph.getAttribute("text:style-name")
+ paraProps = self.paragraphStyles.get(style_name) #, None)
+ text = self.textToString(paragraph)
+
+ #print style_name
+
+ if paraProps and not paraProps.code :
+ text = text.strip()
+
+ if paraProps.title :
+ self.hasTitle = 1
+ return text + "\n" + ("=" * len(text))
+
+ if paraProps.headingLevel :
+
+ level = paraProps.headingLevel
+ if self.hasTitle : level += 1
+
+ if level == 1 :
+ return text + "\n" + ("=" * len(text))
+ elif level == 2 :
+ return text + "\n" + ("-" * len(text))
+ else :
+ return "#" * level + " " + text
+
+ elif paraProps.code :
+ lines = [" %s" % line for line in text.split("\n")]
+ return "\n".join(lines)
+
+ if paraProps.indented :
+ return self.wrapParagraph(text, indent = indent, blockquote = True)
+
+ else :
+ return self.wrapParagraph(text, indent = indent)
+
+
+ def wrapParagraph(self, text, indent = 0, blockquote=False) :
+
+ counter = 0
+ buffer = ""
+ LIMIT = 50
+
+ if blockquote :
+ buffer += "> "
+
+ for token in text.split() :
+
+ if counter > LIMIT - indent :
+ buffer += "\n" + " "*indent
+ if blockquote :
+ buffer += "> "
+ counter = 0
+
+ buffer += token + " "
+ counter += len(token)
+
+ return buffer
+
+
+
+if __name__ == "__main__" :
+
+
+ odt = OpenDocumentTextFile(sys.argv[1])
+
+ #print odt.fixedFonts
+
+ #sys.exit(0)
+ #out = open("out.txt", "wb")
+
+ unicode = odt.toString()
+ out_utf8 = unicode.encode("utf-8")
+
+ sys.stdout.write(out_utf8)
+
+ #out.write(