[IMP] document: indexation of MS documents

Commit 3ced0ff6 removed support for indexing Microsoft documents. That makes sense for the old formats such as '.doc', since indexing them requires an external tool ('antiword'), which could lead to a security issue. However, the newer formats such as '.docx' are zip archives of XML files, so they can be indexed with the usual XML parsing tools. opw-677235

[IMP] document: indexation of MS documents
ccb6f1c5 · Nicolas Martinelli · dcab356b · ccb6f1c5
Commit ccb6f1c5 authored 8 years ago by Nicolas Martinelli
--- a/addons/document/models/ir_attachment.py
+++ b/addons/document/models/ir_attachment.py
@@ -7,34 +7,105 @@ from StringIO import StringIO
 import pyPdf
-import openerp
+from openerp.osv import osv
-from openerp.osv import fields, osv
 _logger = logging.getLogger(__name__)
+FTYPES = ['docx', 'pptx', 'xlsx', 'opendoc', 'pdf']  # suffixes of the _index_<ftype> methods that IrAttachment._index tries, in this order
+# Keep function in case it is necessary to do toUnicode(buf.encode('ascii', 'replace'))
+def toUnicode(s):  # best-effort text conversion of s: tries the conversions below in order
+    try:
+        return s.decode('utf-8')  # first choice: interpret s as UTF-8
+    except UnicodeError:
+        try:
+            return s.decode('latin')  # second choice: the 'latin' codec
+        except UnicodeError:  # NOTE(review): 'latin' decoding of a byte string presumably never fails, so the branches below look unreachable - confirm
+            try:
+                return s.encode('ascii')  # third choice: ASCII-encode s
+            except UnicodeError:
+                return s  # every conversion failed: hand s back unchanged
+def textToString(element):  # return the concatenated text found under a DOM element, walking childNodes in order
+    buff = u""
+    for node in element.childNodes:
+        if node.nodeType == xml.dom.Node.TEXT_NODE:
+            buff += node.nodeValue  # text node: append its value
+        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
+            buff += textToString(node)  # child element: recurse and append its text
+    return buff  # nodes of any other type contribute nothing
 class IrAttachment(osv.osv):
    _inherit = 'ir.attachment'
-    def _index_odt(self, bin_data):
+    def _index_docx(self, bin_data):  # returns the extracted text, or u"" when nothing could be extracted
+        '''Index Microsoft .docx documents'''
+        buf = u""
+        f = StringIO(bin_data)
+        if zipfile.is_zipfile(f):  # data that is not a zip archive is skipped and u"" is returned
+            try:
+                zf = zipfile.ZipFile(f)
+                content = xml.dom.minidom.parseString(zf.read("word/document.xml"))  # only this archive member is parsed
+                for val in ["w:p", "w:h", "text:list"]:  # NOTE(review): "text:list" is also searched by _index_opendoc - confirm it is meaningful for .docx
+                    for element in content.getElementsByTagName(val):
+                        buf += textToString(element) + "\n"  # one output line per matched element
+            except Exception:
+                pass  # best effort: on any error, return whatever text was collected so far
+        return buf
+    def _index_pptx(self, bin_data):  # returns the extracted text, or u"" when nothing could be extracted
+        '''Index Microsoft .pptx documents'''
+        buf = u""
+        f = StringIO(bin_data)
+        if zipfile.is_zipfile(f):  # data that is not a zip archive is skipped and u"" is returned
+            try:
+                zf = zipfile.ZipFile(f)
+                zf_filelist = [x for x in zf.namelist() if x.startswith('ppt/slides/slide')]  # used only for its length (the slide count)
+                for i in range(1, len(zf_filelist) + 1):  # reads slide1.xml .. slideN.xml, N being the number of matching names
+                    content = xml.dom.minidom.parseString(zf.read('ppt/slides/slide%s.xml' % i))  # NOTE(review): assumes contiguous slide numbering; a missing slide<i>.xml presumably raises and ends indexing via the except below - confirm
+                    for val in ["a:t"]:
+                        for element in content.getElementsByTagName(val):
+                            buf += textToString(element) + "\n"  # one output line per matched element
+            except Exception:
+                pass  # best effort: on any error, return whatever text was collected so far
+        return buf
+    def _index_xlsx(self, bin_data):  # returns the extracted text, or u"" when nothing could be extracted
+        '''Index Microsoft .xlsx documents'''
+        buf = u""
+        f = StringIO(bin_data)
+        if zipfile.is_zipfile(f):  # data that is not a zip archive is skipped and u"" is returned
+            try:
+                zf = zipfile.ZipFile(f)
+                content = xml.dom.minidom.parseString(zf.read("xl/sharedStrings.xml"))  # only this archive member is parsed
+                for val in ["t"]:
+                    for element in content.getElementsByTagName(val):
+                        buf += textToString(element) + "\n"  # one output line per matched element
+            except Exception:
+                pass  # best effort: on any error, return whatever text was collected so far
+        return buf
+    def _index_opendoc(self, bin_data):  # returns the extracted text, or u"" when nothing could be extracted
+        '''Index OpenDocument documents (.odt, .ods...)'''
        buf = u""
        f = StringIO(bin_data)
        if zipfile.is_zipfile(f):  # data that is not a zip archive is skipped and u"" is returned
            try:
                zf = zipfile.ZipFile(f)
-                self.content = xml.dom.minidom.parseString(zf.read("content.xml"))
+                content = xml.dom.minidom.parseString(zf.read("content.xml"))  # parsed DOM is now a local instead of being stored on self
                for val in ["text:p", "text:h", "text:list"]:
-                    for element in self.content.getElementsByTagName(val) :
+                    for element in content.getElementsByTagName(val):
-                        for node in element.childNodes :
+                        buf += textToString(element) + "\n"  # module-level textToString replaces the removed inline node walk; one output line per matched element
-                            if node.nodeType == xml.dom.Node.TEXT_NODE :
-                                buf += node.nodeValue
-                            elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
-                                buf += self.textToString(node)
-                        buf += "\n"
            except Exception:
                pass  # best effort: on any error, return whatever text was collected so far
        return buf
    def _index_pdf(self, bin_data):
+        '''Index PDF documents'''
        buf = u""
        if bin_data.startswith('%PDF-'):
            f = StringIO(bin_data)
@@ -47,14 +118,9 @@ class IrAttachment(osv.osv):
        return buf
    def _index(self, cr, uid, bin_data, datas_fname, mimetype):  # returns the first non-empty indexer result, else the parent's result
-        # try to index odt content
+        for ftype in FTYPES:  # try each format-specific indexer in FTYPES order
-        buf = self._index_odt(bin_data)
+            buf = getattr(self, '_index_%s' % ftype)(bin_data)  # resolves to self._index_<ftype>
-        if buf:
+            if buf:
-            return buf
+                return buf  # first indexer that yields non-empty text wins
-        # try to index pdf content
-        buf = self._index_pdf(bin_data)
-        if buf:
-            return buf
        return super(IrAttachment, self)._index(cr, uid, bin_data, datas_fname, mimetype)  # no indexer produced text: defer to the parent implementation