Skip to content
Snippets Groups Projects
Commit ccb6f1c5 authored by Nicolas Martinelli's avatar Nicolas Martinelli
Browse files

[IMP] document: indexation of MS documents

Commit 3ced0ff6 removed the support of Microsoft documents for
indexation. It makes sense for the old formats such as '.doc' since it
requires an external tool ('antiword'), which could lead to a security
issue. However, the new formats such as '.docx' are zip archives of XML
files, so they can be indexed with the usual XML parsing tools.

opw-677235
parent dcab356b
No related branches found
No related tags found
No related merge requests found
...@@ -7,34 +7,105 @@ from StringIO import StringIO ...@@ -7,34 +7,105 @@ from StringIO import StringIO
import pyPdf import pyPdf
import openerp from openerp.osv import osv
from openerp.osv import fields, osv
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
FTYPES = ['docx', 'pptx', 'xlsx', 'opendoc', 'pdf']
# Keep function in case it is necessary to do toUnicode(buf.encode('ascii', 'replace'))
def toUnicode(s):
    '''Best-effort conversion of ``s`` to a text (unicode) string.

    Byte strings are decoded as UTF-8 first and as Latin-1 when that fails.
    Any value that is not a byte string (typically text that is already
    unicode) is returned unchanged.

    :param s: byte string or text to convert
    :return: the decoded text, or ``s`` itself when it is not a byte string
    '''
    if not isinstance(s, bytes):
        # Nothing to decode; avoids implicit ASCII round-trips on text input.
        return s
    try:
        return s.decode('utf-8')
    except UnicodeError:
        # Latin-1 maps every possible byte to a code point, so this decode
        # cannot fail and no further fallback is needed.
        return s.decode('latin-1')
def textToString(element):
    '''Return the concatenated text found below a DOM ``element``.

    The children of ``element`` are visited in document order: the value of
    each text node is kept, element nodes are processed recursively, and
    every other node type (comments, processing instructions...) is ignored.

    :param element: DOM node exposing ``childNodes``
    :return: unicode string with all the collected text, possibly empty
    '''
    parts = []
    for node in element.childNodes:
        if node.nodeType == xml.dom.Node.TEXT_NODE:
            parts.append(node.nodeValue)
        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
            parts.append(textToString(node))
    # Join once instead of growing a string with repeated concatenation.
    return u"".join(parts)
class IrAttachment(osv.osv): class IrAttachment(osv.osv):
_inherit = 'ir.attachment' _inherit = 'ir.attachment'
def _index_docx(self, bin_data):
    '''Index Microsoft .docx documents

    Treats ``bin_data`` as a zip archive, parses its ``word/document.xml``
    member and collects the text of every matching element, one element
    per line.

    :param bin_data: raw content of the attachment
    :return: unicode string with the extracted text; empty when
             ``bin_data`` is not a zip archive or nothing could be read
    '''
    parts = []
    f = StringIO(bin_data)
    if zipfile.is_zipfile(f):
        try:
            with zipfile.ZipFile(f) as zf:
                content = xml.dom.minidom.parseString(zf.read("word/document.xml"))
                # NOTE(review): "w:h" and "text:list" look like leftovers
                # from the OpenDocument indexer; kept to preserve behavior.
                for val in ["w:p", "w:h", "text:list"]:
                    for element in content.getElementsByTagName(val):
                        parts.append(textToString(element) + "\n")
        except Exception:
            # Indexing is best effort: every zip-based attachment is tried
            # here, so a missing member or broken XML is an expected case.
            # Keep whatever was collected and leave a trace for debugging.
            _logger.debug("Cannot index attachment as docx", exc_info=True)
    return u"".join(parts)
def _index_pptx(self, bin_data):
    '''Index Microsoft .pptx documents

    Treats ``bin_data`` as a zip archive and collects the text of every
    ``a:t`` element of each ``ppt/slides/slide<N>.xml`` member, in slide
    number order, one element per line.

    :param bin_data: raw content of the attachment
    :return: unicode string with the extracted text; empty when
             ``bin_data`` is not a zip archive or nothing could be read
    '''
    prefix, suffix = 'ppt/slides/slide', '.xml'
    parts = []
    f = StringIO(bin_data)
    if zipfile.is_zipfile(f):
        try:
            with zipfile.ZipFile(f) as zf:
                # Read the slide members that are actually present rather
                # than assuming they are numbered 1..N: the numbering may
                # have gaps, and a missing member would abort the indexing.
                slides = []
                for name in zf.namelist():
                    if name.startswith(prefix) and name.endswith(suffix):
                        number = name[len(prefix):-len(suffix)]
                        if number.isdigit():
                            slides.append((int(number), name))
                for _number, name in sorted(slides):
                    content = xml.dom.minidom.parseString(zf.read(name))
                    for element in content.getElementsByTagName("a:t"):
                        parts.append(textToString(element) + "\n")
        except Exception:
            # Indexing is best effort: keep whatever was collected so far
            # and leave a trace for debugging instead of failing the caller.
            _logger.debug("Cannot index attachment as pptx", exc_info=True)
    return u"".join(parts)
def _index_xlsx(self, bin_data):
    '''Index Microsoft .xlsx documents

    Treats ``bin_data`` as a zip archive, parses its
    ``xl/sharedStrings.xml`` member and collects the text of every ``t``
    element, one element per line.

    :param bin_data: raw content of the attachment
    :return: unicode string with the extracted text; empty when
             ``bin_data`` is not a zip archive or nothing could be read
    '''
    parts = []
    f = StringIO(bin_data)
    if zipfile.is_zipfile(f):
        try:
            with zipfile.ZipFile(f) as zf:
                content = xml.dom.minidom.parseString(zf.read("xl/sharedStrings.xml"))
                for element in content.getElementsByTagName("t"):
                    parts.append(textToString(element) + "\n")
        except Exception:
            # Indexing is best effort: every zip-based attachment is tried
            # here, so a missing member or broken XML is an expected case.
            # Keep whatever was collected and leave a trace for debugging.
            _logger.debug("Cannot index attachment as xlsx", exc_info=True)
    return u"".join(parts)
def _index_opendoc(self, bin_data):
    '''Index OpenDocument documents (.odt, .ods...)

    Treats ``bin_data`` as a zip archive, parses its ``content.xml``
    member and collects the text of every ``text:p``, ``text:h`` and
    ``text:list`` element, one element per line.

    :param bin_data: raw content of the attachment
    :return: unicode string with the extracted text; empty when
             ``bin_data`` is not a zip archive or nothing could be read
    '''
    parts = []
    f = StringIO(bin_data)
    if zipfile.is_zipfile(f):
        try:
            with zipfile.ZipFile(f) as zf:
                content = xml.dom.minidom.parseString(zf.read("content.xml"))
                for val in ["text:p", "text:h", "text:list"]:
                    for element in content.getElementsByTagName(val):
                        parts.append(textToString(element) + "\n")
        except Exception:
            # Indexing is best effort: every zip-based attachment is tried
            # here, so a missing member or broken XML is an expected case.
            # Keep whatever was collected and leave a trace for debugging.
            _logger.debug("Cannot index attachment as OpenDocument", exc_info=True)
    return u"".join(parts)
def _index_pdf(self, bin_data): def _index_pdf(self, bin_data):
'''Index PDF documents'''
buf = u"" buf = u""
if bin_data.startswith('%PDF-'): if bin_data.startswith('%PDF-'):
f = StringIO(bin_data) f = StringIO(bin_data)
...@@ -47,14 +118,9 @@ class IrAttachment(osv.osv): ...@@ -47,14 +118,9 @@ class IrAttachment(osv.osv):
return buf return buf
def _index(self, cr, uid, bin_data, datas_fname, mimetype):
    '''Return the indexable text of an attachment.

    Calls the ``_index_<ftype>`` method for each entry of ``FTYPES``, in
    order, and returns the first non-empty result. When every indexer
    returns an empty value, the parent implementation is used.
    '''
    for ftype in FTYPES:
        indexer = getattr(self, '_index_%s' % ftype)
        text = indexer(bin_data)
        if text:
            return text
    return super(IrAttachment, self)._index(cr, uid, bin_data, datas_fname, mimetype)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment