Skip to content
Snippets Groups Projects
Commit ccb6f1c5 authored by Nicolas Martinelli's avatar Nicolas Martinelli
Browse files

[IMP] document: indexation of MS documents

Commit 3ced0ff6 removed the support of Microsoft documents for
indexation. It makes sense for the old formats such as '.doc' since it
requires an external tool ('antiword'), which could lead to a security
issue. However, the new formats such as '.docx' are simple XML files,
therefore they can be indexed with the usual XML parsing tools.

opw-677235
parent dcab356b
No related branches found
No related tags found
No related merge requests found
......@@ -7,34 +7,105 @@ from StringIO import StringIO
import pyPdf
import openerp
from openerp.osv import fields, osv
from openerp.osv import osv
# Module-level logger, named after this module.
_logger = logging.getLogger(__name__)
# File types this module knows how to index. For each entry ``ftype``,
# ``IrAttachment._index`` calls the method named ``_index_<ftype>``; the
# entries are tried in list order and the first non-empty result wins.
FTYPES = ['docx', 'pptx', 'xlsx', 'opendoc', 'pdf']
# Keep function in case it is necessary to do toUnicode(buf.encode('ascii', 'replace'))
def toUnicode(s):
    """Decode ``s`` to unicode, trying UTF-8 first and Latin-1 second.

    :param s: string to decode (anything exposing ``decode``)
    :return: the decoded text, or ``s`` itself when neither decoding
        succeeds
    """
    try:
        return s.decode('utf-8')
    except UnicodeError:
        pass
    try:
        return s.decode('latin')
    except UnicodeError:
        # The former ``s.encode('ascii')`` fallback was dropped: this point
        # is only reached when the implicit ASCII encoding of a unicode
        # object has already failed, so that call could never succeed and
        # ``s`` was always returned unchanged.
        return s
def textToString(element):
    """Return the concatenated text found under a DOM ``element``.

    Text nodes are taken in document order and child elements are visited
    recursively; every other node type is ignored.

    :param element: DOM node whose ``childNodes`` are walked
    :return: unicode string (``u""`` when there is no text)
    """
    # Collect the pieces and join once instead of growing a unicode string
    # with ``+=`` at every node.
    parts = []
    for node in element.childNodes:
        if node.nodeType == xml.dom.Node.TEXT_NODE:
            parts.append(node.nodeValue)
        elif node.nodeType == xml.dom.Node.ELEMENT_NODE:
            parts.append(textToString(node))
    return u"".join(parts)
class IrAttachment(osv.osv):
_inherit = 'ir.attachment'
def _index_odt(self, bin_data):
def _index_docx(self, bin_data):
'''Index Microsoft .docx documents'''
buf = u""
f = StringIO(bin_data)
if zipfile.is_zipfile(f):
try:
zf = zipfile.ZipFile(f)
content = xml.dom.minidom.parseString(zf.read("word/document.xml"))
for val in ["w:p", "w:h", "text:list"]:
for element in content.getElementsByTagName(val):
buf += textToString(element) + "\n"
except Exception:
pass
return buf
def _index_pptx(self, bin_data):
'''Index Microsoft .pptx documents'''
buf = u""
f = StringIO(bin_data)
if zipfile.is_zipfile(f):
try:
zf = zipfile.ZipFile(f)
zf_filelist = [x for x in zf.namelist() if x.startswith('ppt/slides/slide')]
for i in range(1, len(zf_filelist) + 1):
content = xml.dom.minidom.parseString(zf.read('ppt/slides/slide%s.xml' % i))
for val in ["a:t"]:
for element in content.getElementsByTagName(val):
buf += textToString(element) + "\n"
except Exception:
pass
return buf
def _index_xlsx(self, bin_data):
'''Index Microsoft .xlsx documents'''
buf = u""
f = StringIO(bin_data)
if zipfile.is_zipfile(f):
try:
zf = zipfile.ZipFile(f)
content = xml.dom.minidom.parseString(zf.read("xl/sharedStrings.xml"))
for val in ["t"]:
for element in content.getElementsByTagName(val):
buf += textToString(element) + "\n"
except Exception:
pass
return buf
def _index_opendoc(self, bin_data):
'''Index OpenDocument documents (.odt, .ods...)'''
buf = u""
f = StringIO(bin_data)
if zipfile.is_zipfile(f):
try:
zf = zipfile.ZipFile(f)
self.content = xml.dom.minidom.parseString(zf.read("content.xml"))
content = xml.dom.minidom.parseString(zf.read("content.xml"))
for val in ["text:p", "text:h", "text:list"]:
for element in self.content.getElementsByTagName(val) :
for node in element.childNodes :
if node.nodeType == xml.dom.Node.TEXT_NODE :
buf += node.nodeValue
elif node.nodeType == xml.dom.Node.ELEMENT_NODE :
buf += self.textToString(node)
buf += "\n"
for element in content.getElementsByTagName(val):
buf += textToString(element) + "\n"
except Exception:
pass
return buf
def _index_pdf(self, bin_data):
'''Index PDF documents'''
buf = u""
if bin_data.startswith('%PDF-'):
f = StringIO(bin_data)
......@@ -47,14 +118,9 @@ class IrAttachment(osv.osv):
return buf
def _index(self, cr, uid, bin_data, datas_fname, mimetype):
    """Return the indexable text of an attachment.

    Each indexer named in ``FTYPES`` is tried in order through its
    ``_index_<ftype>`` method; the first non-empty result is returned.
    When none of them yields text, the parent implementation is used.

    :param cr: passed through unchanged to the parent implementation
    :param uid: passed through unchanged to the parent implementation
    :param bin_data: raw content of the attachment, given to each indexer
    :param datas_fname: passed through unchanged to the parent implementation
    :param mimetype: passed through unchanged to the parent implementation
    :return: the extracted text, or whatever the parent ``_index`` returns
    """
    for ftype in FTYPES:
        buf = getattr(self, '_index_%s' % ftype)(bin_data)
        if buf:
            return buf
    return super(IrAttachment, self)._index(cr, uid, bin_data, datas_fname, mimetype)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment