"""
ODT2TXT
=======

ODT2TXT converts files in Open Document Text format (ODT) into
Markdown-formatted plain text.

Written by [Yuri Takhteyev](http://www.freewisdom.org).

Project website: http://www.freewisdom.org/projects/python-markdown/odt2txt.php
Contact: yuri [at] freewisdom.org

License: GPL 2 (http://www.gnu.org/copyleft/gpl.html) or BSD

Version: 0.1 (April 7, 2006)

"""

import copy
import sys
import xml.dom.minidom
import zipfile

IGNORED_TAGS = ["office:annotation"]

FOOTNOTE_STYLES = ["Footnote"]

# Markdown only treats a paragraph as a code block when every line is
# indented by at least four spaces (or a tab).
CODE_INDENT = "    "


def _first_element_child(node):
    """Return the first child of *node* that is an element, or None."""
    for child in node.childNodes:
        if child.nodeType == xml.dom.Node.ELEMENT_NODE:
            return child
    return None


class TextProps:
    """Holds properties for a text style."""

    def __init__(self):
        self.italic = False
        self.bold = False
        self.fixed = False

    def setItalic(self, value):
        """Mark the style italic when *value* is the string "italic"."""
        if value == "italic":
            self.italic = True

    def setBold(self, value):
        """Mark the style bold when *value* is the string "bold"."""
        if value == "bold":
            self.bold = True

    def setFixed(self, value):
        """Set whether the style uses a fixed-pitch font."""
        self.fixed = value

    def __str__(self):
        return "[i=%s, b=%s, fixed=%s]" % (str(self.italic),
                                           str(self.bold),
                                           str(self.fixed))


class ParagraphProps:
    """Holds properties of a paragraph style."""

    def __init__(self):
        self.blockquote = False
        self.headingLevel = 0
        self.code = False
        self.title = False
        self.indented = 0

    def setIndented(self, value):
        """Set whether the paragraph has a left margin."""
        self.indented = value

    def setHeading(self, level):
        """Set the heading level (0 means "not a heading")."""
        self.headingLevel = level

    def setTitle(self, value):
        """Set whether the paragraph is the document title."""
        self.title = value

    def setCode(self, value):
        """Set whether the paragraph is rendered as a code block."""
        self.code = value

    def __str__(self):
        return "[bq=%s, h=%d, code=%s]" % (str(self.blockquote),
                                           self.headingLevel,
                                           str(self.code))


class ListProperties:
    """Holds properties for a list style."""

    def __init__(self):
        self.ordered = False

    def setOrdered(self, value):
        """Set whether the list is numbered."""
        self.ordered = value


class OpenDocumentTextFile:
    """An ODT document that can be rendered as Markdown via toString()."""

    def __init__(self, filepath):
        """Load the document at *filepath* (a path or a file-like object)."""
        self.footnotes = []
        self.footnoteCounter = 0
        self.textStyles = {"Standard": TextProps()}
        self.paragraphStyles = {"Standard": ParagraphProps()}
        self.listStyles = {}
        self.fixedFonts = []
        self.hasTitle = 0

        self.load(filepath)

    def processFontDeclarations(self, fontDecl):
        """Record the names of fixed-pitch fonts declared in *fontDecl*."""
        for fontFace in fontDecl.getElementsByTagName("style:font-face"):
            if fontFace.getAttribute("style:font-pitch") == "fixed":
                self.fixedFonts.append(fontFace.getAttribute("style:name"))

    def extractTextProperties(self, style, parent=None):
        """Extract text properties from a style element.

        When *parent* names an already-known text style, the result starts
        as a copy of that style's properties, and attributes present on
        *style* override the inherited values.  Returns a new TextProps.
        """
        parentProps = self.textStyles.get(parent) if parent else None
        if parentProps is not None:
            # Copy so that the child's overrides never leak into the parent.
            textProps = copy.copy(parentProps)
        else:
            textProps = TextProps()

        textPropEls = style.getElementsByTagName("style:text-properties")
        if not textPropEls:
            return textProps
        textPropEl = textPropEls[0]

        # An absent attribute yields "", which keeps the inherited value.
        italic = textPropEl.getAttribute("fo:font-style")
        if italic:
            textProps.italic = (italic == "italic")

        bold = textPropEl.getAttribute("fo:font-weight")
        if bold:
            textProps.bold = (bold == "bold")

        fontName = textPropEl.getAttribute("style:font-name")
        if fontName:
            textProps.fixed = fontName in self.fixedFonts

        return textProps

    def extractParagraphProperties(self, style, parent=None):
        """Extract paragraph properties from a style element.

        When *parent* names an already-known paragraph style, the result
        starts as a copy of that style's properties.  Returns a new
        ParagraphProps.
        """
        parentProps = self.paragraphStyles.get(parent) if parent else None
        if parentProps is not None:
            paraProps = copy.copy(parentProps)
        else:
            paraProps = ParagraphProps()

        name = style.getAttribute("style:name")

        headingPrefix = "Heading_20_"
        if name.startswith(headingPrefix):
            try:
                paraProps.setHeading(int(name[len(headingPrefix):]))
            except ValueError:
                pass  # not a numbered heading style; keep current level

        if name == "Title":
            paraProps.setTitle(True)

        paraPropEls = style.getElementsByTagName("style:paragraph-properties")
        if paraPropEls:
            leftMargin = paraPropEls[0].getAttribute("fo:margin-left")
            if leftMargin:
                try:
                    # Drop the two-character unit suffix ("in", "cm", ...).
                    margin = float(leftMargin[:-2])
                except ValueError:
                    pass  # unparsable margin; keep current value
                else:
                    paraProps.setIndented(margin > 0.01)

        textProps = self.extractTextProperties(style)
        if textProps.fixed:
            paraProps.setCode(True)

        return paraProps

    def processStyles(self, styleElements):
        """Run through "style" elements extracting necessary information."""
        for style in styleElements:
            name = style.getAttribute("style:name")
            if name == "Standard":
                continue

            family = style.getAttribute("style:family")
            parent = style.getAttribute("style:parent-style-name")

            if family == "text":
                self.textStyles[name] = self.extractTextProperties(
                    style, parent)
            elif family == "paragraph":
                self.paragraphStyles[name] = self.extractParagraphProperties(
                    style, parent)

    def processListStyles(self, listStyleElements):
        """Record, for each list style, whether it is a numbered list."""
        for style in listStyleElements:
            name = style.getAttribute("style:name")
            prop = ListProperties()
            # Only the first level decides; skip whitespace text nodes.
            firstLevel = _first_element_child(style)
            if (firstLevel is not None and
                    firstLevel.tagName == "text:list-level-style-number"):
                prop.setOrdered(True)
            self.listStyles[name] = prop

    def _processDocumentStyles(self, document):
        """Collect fonts, styles and list styles from a parsed XML document."""
        for fontDecl in document.getElementsByTagName(
                "office:font-face-decls"):
            self.processFontDeclarations(fontDecl)
        self.processStyles(document.getElementsByTagName("style:style"))
        self.processListStyles(document.getElementsByTagName(
            "text:list-style"))

    def load(self, filepath):
        """Load an ODT file, reading its styles.xml and content.xml.

        The parsed content.xml is stored in self.content.  Errors raised by
        zipfile (bad archive, missing member) and by the XML parser
        propagate to the caller.
        """
        archive = zipfile.ZipFile(filepath)
        try:
            stylesXml = archive.read("styles.xml")
            contentXml = archive.read("content.xml")
        finally:
            archive.close()

        self._processDocumentStyles(xml.dom.minidom.parseString(stylesXml))

        self.content = xml.dom.minidom.parseString(contentXml)
        self._processDocumentStyles(self.content)

    def compressCodeBlocks(self, text):
        """Remove blank lines that sit between two code-block lines.

        Every kept line is emitted with a preceding newline, so the result
        always starts with "\\n".
        """
        lines = text.split("\n")
        last = len(lines) - 1
        kept = []
        for i, line in enumerate(lines):
            if line.strip() or i == 0 or i == last:
                kept.append(line)
            elif not (lines[i - 1].startswith(CODE_INDENT) and
                      lines[i + 1].startswith(CODE_INDENT)):
                kept.append(line)
        return "".join(["\n" + line for line in kept])

    def listToString(self, listElement):
        """Render a text:list element as a Markdown list.

        Only the first element inside each list item is rendered.  Each
        item is followed by a blank line.
        """
        styleName = listElement.getAttribute("text:style-name")
        props = self.listStyles.get(styleName, ListProperties())

        parts = []
        number = 0
        for item in listElement.childNodes:
            if item.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            firstChild = _first_element_child(item)
            if firstChild is None:
                continue
            number += 1
            if props.ordered:
                # Always put a space after the dot, also for items >= 10.
                marker = "%d. " % number
            else:
                marker = "* "
            parts.append(marker + self.paragraphToString(
                firstChild, indent=len(marker)))
            parts.append("\n\n")

        return u"".join(parts)

    def toString(self):
        """Convert the document to a Markdown string.

        Footnotes and the title flag are reset on entry so that calling
        this method repeatedly yields the same output.
        """
        self.footnotes = []
        self.hasTitle = 0

        textElements = self.content.getElementsByTagName("office:text")
        children = textElements[0].childNodes if textElements else []

        parts = []
        for element in children:
            if element.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue
            if element.tagName == "text:list":
                rendered = self.listToString(element)
            elif element.tagName in ("text:p", "text:h"):
                rendered = self.paragraphToString(element)
            else:
                continue
            if rendered:
                parts.append(rendered + "\n\n")

        if self.footnotes:
            parts.append("--------\n\n")
            for cite, noteText in self.footnotes:
                parts.append("[^%s]: %s\n\n" % (cite, noteText))

        return self.compressCodeBlocks(u"".join(parts))

    def textToString(self, element):
        """Render the inline content of *element* as Markdown text.

        Footnotes found on the way are appended to self.footnotes as
        (citation, text) pairs and replaced by a "[^citation]" marker.
        """
        parts = []

        for node in element.childNodes:

            if node.nodeType == xml.dom.Node.TEXT_NODE:
                parts.append(node.nodeValue)
                continue
            if node.nodeType != xml.dom.Node.ELEMENT_NODE:
                continue

            tag = node.tagName

            if tag == "text:span":
                text = self.textToString(node)

                if not text.strip():
                    # Don't apply styles to white space, but keep it.
                    parts.append(text)
                    continue

                styleName = node.getAttribute("text:style-name")
                style = self.textStyles.get(styleName)

                if style is None:
                    # Unknown style: make it visible in the output.
                    mark = "<" + styleName + ">"
                elif style.fixed:
                    parts.append("`" + text + "`")
                    continue
                elif style.italic and style.bold:
                    mark = "***"
                elif style.italic:
                    mark = "_"
                elif style.bold:
                    mark = "**"
                else:
                    mark = ""

                parts.append("%s%s%s" % (mark, text, mark))

            elif tag == "text:note":
                cite = (node.getElementsByTagName("text:note-citation")[0]
                        .childNodes[0].nodeValue)
                noteBody = node.getElementsByTagName("text:note-body")[0]
                firstPara = _first_element_child(noteBody)
                if firstPara is None:
                    firstPara = noteBody
                self.footnotes.append((cite, self.textToString(firstPara)))
                parts.append("[^%s]" % cite)

            elif tag in IGNORED_TAGS:
                pass

            elif tag == "text:s":
                # text:c holds the number of spaces; absent means one.
                try:
                    parts.append(" " * int(node.getAttribute("text:c")))
                except ValueError:
                    parts.append(" ")

            elif tag == "text:tab":
                parts.append(" ")

            elif tag == "text:a":
                text = self.textToString(node)
                link = node.getAttribute("xlink:href")
                parts.append("[%s](%s)" % (text, link))

            else:
                # Unsupported element: leave a visible placeholder.
                parts.append(" {" + tag + "} ")

        return u"".join(parts)

    def paragraphToString(self, paragraph, indent=0):
        """Render a paragraph or heading element as Markdown.

        *indent* is the number of spaces put in front of continuation
        lines when the paragraph is wrapped.  Paragraphs whose style is
        unknown are rendered as plain wrapped text.
        """
        style_name = paragraph.getAttribute("text:style-name")
        paraProps = self.paragraphStyles.get(style_name)
        if paraProps is None:
            paraProps = ParagraphProps()

        text = self.textToString(paragraph)

        if paraProps.code:
            return "\n".join([CODE_INDENT + line
                              for line in text.split("\n")])

        text = text.strip()

        if paraProps.title:
            self.hasTitle = 1
            return text + "\n" + ("=" * len(text))

        if paraProps.headingLevel:
            level = paraProps.headingLevel
            if self.hasTitle:
                # The title took level 1, so push headings one level down.
                level += 1
            if level == 1:
                return text + "\n" + ("=" * len(text))
            if level == 2:
                return text + "\n" + ("-" * len(text))
            return "#" * level + " " + text

        if paraProps.indented:
            return self.wrapParagraph(text, indent=indent, blockquote=True)
        return self.wrapParagraph(text, indent=indent)

    def wrapParagraph(self, text, indent=0, blockquote=False, limit=50):
        """Re-flow *text* into lines of roughly *limit* characters.

        Whitespace runs collapse to single spaces and every token is
        followed by a space.  The width counter sums token lengths only
        (spaces are not counted) and a line is broken once the counter
        exceeds limit - indent.  Continuation lines start with *indent*
        spaces; with *blockquote* every line starts with "> ".
        """
        pieces = []
        if blockquote:
            pieces.append("> ")

        counter = 0
        for token in text.split():
            if counter > limit - indent:
                pieces.append("\n" + " " * indent)
                if blockquote:
                    pieces.append("> ")
                counter = 0
            pieces.append(token + " ")
            counter += len(token)

        return "".join(pieces)


def main(argv=None):
    """Convert the ODT file named in argv[1] and write UTF-8 to stdout.

    Returns a process exit status: 0 on success, 2 on a usage error.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write("Usage: %s FILE.odt\n" % argv[0])
        return 2

    odt = OpenDocumentTextFile(argv[1])
    out_utf8 = odt.toString().encode("utf-8")

    # Python 3 needs the underlying binary stream for bytes; Python 2 has
    # no such attribute and writes bytes to sys.stdout directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(out_utf8)
    return 0


if __name__ == "__main__":
    sys.exit(main())