[IMP] document: indexation of MS documents

Commit 3ced0ff6 removed the support of Microsoft documents for indexation. This makes sense for the old formats such as '.doc', since indexing them requires an external tool ('antiword'), which could lead to a security issue. However, the new formats such as '.docx' are simply zip archives of XML files, and can therefore be indexed with the usual zip and XML parsing tools. opw-677235

[IMP] document: indexation of MS documents
ccb6f1c5 · Nicolas Martinelli · dcab356b · ccb6f1c5
Commit ccb6f1c5 authored 8 years ago by Nicolas Martinelli
--- a/addons/document/models/ir_attachment.py
+++ b/addons/document/models/ir_attachment.py
@@ -7,34 +7,105 @@ from StringIO import StringIO

 import pyPdf

-import openerp
-from openerp.osv import fields, osv
+from openerp.osv import osv

 _logger = logging.getLogger(__name__)
+FTYPES = ['docx', 'pptx', 'xlsx', 'opendoc', 'pdf']
+
+# Keep function in case it is necessary to do toUnicode(buf.encode('ascii', 'replace'))
+def toUnicode(s):
+    try:
+        return s.decode('utf-8')
+    except UnicodeError:
+        try:
+            return s.decode('latin')
+        except UnicodeError:
+            try:
+                return s.encode('ascii')
+            except UnicodeError:
+                return s
+
+def textToString(element):
+    buff = u""
+    for node in element.childNodes:
+        if node.nodeType == xml.dom.Node.TEXT_NODE:
+            buff += node.nodeValue
+        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
+            buff += textToString(node)
+    return buff
+

 class IrAttachment(osv.osv):
    _inherit = 'ir.attachment'

-    def _index_odt(self, bin_data):
+    def _index_docx(self, bin_data):
+        '''Index Microsoft .docx documents'''
+        buf = u""
+        f = StringIO(bin_data)
+        if zipfile.is_zipfile(f):
+            try:
+                zf = zipfile.ZipFile(f)
+                content = xml.dom.minidom.parseString(zf.read("word/document.xml"))
+                for val in ["w:p", "w:h", "text:list"]:
+                    for element in content.getElementsByTagName(val):
+                        buf += textToString(element) + "\n"
+            except Exception:
+                pass
+        return buf
+
+    def _index_pptx(self, bin_data):
+        '''Index Microsoft .pptx documents'''
+
+        buf = u""
+        f = StringIO(bin_data)
+        if zipfile.is_zipfile(f):
+            try:
+                zf = zipfile.ZipFile(f)
+                zf_filelist = [x for x in zf.namelist() if x.startswith('ppt/slides/slide')]
+                for i in range(1, len(zf_filelist) + 1):
+                    content = xml.dom.minidom.parseString(zf.read('ppt/slides/slide%s.xml' % i))
+                    for val in ["a:t"]:
+                        for element in content.getElementsByTagName(val):
+                            buf += textToString(element) + "\n"
+            except Exception:
+                pass
+        return buf
+
+    def _index_xlsx(self, bin_data):
+        '''Index Microsoft .xlsx documents'''
+
+        buf = u""
+        f = StringIO(bin_data)
+        if zipfile.is_zipfile(f):
+            try:
+                zf = zipfile.ZipFile(f)
+                content = xml.dom.minidom.parseString(zf.read("xl/sharedStrings.xml"))
+                for val in ["t"]:
+                    for element in content.getElementsByTagName(val):
+                        buf += textToString(element) + "\n"
+            except Exception:
+                pass
+        return buf
+
+    def _index_opendoc(self, bin_data):
+        '''Index OpenDocument documents (.odt, .ods...)'''
+
        buf = u""
        f = StringIO(bin_data)
        if zipfile.is_zipfile(f):
            try:
                zf = zipfile.ZipFile(f)
-                self.content = xml.dom.minidom.parseString(zf.read("content.xml"))
+                content = xml.dom.minidom.parseString(zf.read("content.xml"))
                for val in ["text:p", "text:h", "text:list"]:
-                    for element in self.content.getElementsByTagName(val) :
-                        for node in element.childNodes :
-                            if node.nodeType == xml.dom.Node.TEXT_NODE :
-                                buf += node.nodeValue
-                            elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
-                                buf += self.textToString(node)
-                        buf += "\n"
+                    for element in content.getElementsByTagName(val):
+                        buf += textToString(element) + "\n"
            except Exception:
                pass
        return buf

    def _index_pdf(self, bin_data):
+        '''Index PDF documents'''
+
        buf = u""
        if bin_data.startswith('%PDF-'):
            f = StringIO(bin_data)
@@ -47,14 +118,9 @@ class IrAttachment(osv.osv):
        return buf

    def _index(self, cr, uid, bin_data, datas_fname, mimetype):
-        # try to index odt content
-        buf = self._index_odt(bin_data)
-        if buf:
-            return buf
-        # try to index pdf content
-        buf = self._index_pdf(bin_data)
-        if buf:
-            return buf
+        for ftype in FTYPES:
+            buf = getattr(self, '_index_%s' % ftype)(bin_data)
+            if buf:
+                return buf

        return super(IrAttachment, self)._index(cr, uid, bin_data, datas_fname, mimetype)
-