|
a/src/filters/rclpdf.py |
|
b/src/filters/rclpdf.py |
|
... |
|
... |
76 |
|
76 |
|
77 |
class PDFExtractor:
|
77 |
class PDFExtractor:
|
78 |
def __init__(self, em):
|
78 |
def __init__(self, em):
|
79 |
self.currentindex = 0
|
79 |
self.currentindex = 0
|
80 |
self.pdftotext = None
|
80 |
self.pdftotext = None
|
|
|
81 |
self.pdfinfo = None
|
|
|
82 |
self.pdftk = None
|
81 |
self.em = em
|
83 |
self.em = em
|
82 |
|
84 |
self.tesseract = None
|
83 |
self.confdir = rclconfig.RclConfig().getConfDir()
|
|
|
84 |
cf_doocr = rclconfig.RclConfig().getConfParam("pdfocr")
|
|
|
85 |
cf_attach = rclconfig.RclConfig().getConfParam("pdfattach")
|
|
|
86 |
|
85 |
|
87 |
self.pdftotext = rclexecm.which("pdftotext")
|
86 |
self.pdftotext = rclexecm.which("pdftotext")
|
88 |
if not self.pdftotext:
|
87 |
if not self.pdftotext:
|
89 |
self.pdftotext = rclexecm.which("poppler/pdftotext")
|
88 |
self.pdftotext = rclexecm.which("poppler/pdftotext")
|
|
|
89 |
# No need for anything else. openfile() will return an
|
|
|
90 |
# error at once
|
|
|
91 |
return
|
|
|
92 |
|
|
|
93 |
cf = rclconfig.RclConfig()
|
|
|
94 |
self.confdir = cf.getConfDir()
|
|
|
95 |
|
|
|
96 |
# The user can set a list of meta tags to be extracted from
|
|
|
97 |
# the XMP metadata packet. These are specified as
|
|
|
98 |
# (xmltag,rcltag) pairs
|
|
|
99 |
self.extrameta = cf.getConfParam("pdfextrameta")
|
|
|
100 |
if self.extrameta:
|
|
|
101 |
self._initextrameta()
|
90 |
|
102 |
|
91 |
# Check if we need to escape portions of text where old
|
103 |
# Check if we need to escape portions of text where old
|
92 |
# versions of pdftotext output raw HTML special characters.
|
104 |
# versions of pdftotext output raw HTML special characters.
|
93 |
self.needescape = True
|
105 |
self.needescape = True
|
94 |
try:
|
106 |
try:
|
|
... |
|
... |
104 |
|
116 |
|
105 |
# See if we'll try to perform OCR. Need the commands and the
|
117 |
# See if we'll try to perform OCR. Need the commands and the
|
106 |
# either the presence of a file in the config dir (historical)
|
118 |
# either the presence of a file in the config dir (historical)
|
107 |
# or a set config variable.
|
119 |
# or a set config variable.
|
108 |
self.ocrpossible = False
|
120 |
self.ocrpossible = False
|
|
|
121 |
cf_doocr = cf.getConfParam("pdfocr")
|
109 |
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
122 |
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
110 |
self.tesseract = rclexecm.which("tesseract")
|
123 |
self.tesseract = rclexecm.which("tesseract")
|
111 |
if self.tesseract:
|
124 |
if self.tesseract:
|
112 |
self.pdftoppm = rclexecm.which("pdftoppm")
|
125 |
self.pdftoppm = rclexecm.which("pdftoppm")
|
113 |
if self.pdftoppm:
|
126 |
if self.pdftoppm:
|
114 |
self.ocrpossible = True
|
127 |
self.ocrpossible = True
|
115 |
self.maybemaketmpdir()
|
128 |
self.maybemaketmpdir()
|
116 |
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
|
129 |
# self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
|
117 |
|
130 |
|
118 |
# Pdftk is optionally used to extract attachments. This takes
|
131 |
# Pdftk is optionally used to extract attachments. This takes
|
119 |
# a hit on perfmance even in the absence of any attachments,
|
132 |
# a hit on performance even in the absence of any attachments,
|
120 |
# so it can be disabled in the configuration.
|
133 |
# so it can be disabled in the configuration.
|
121 |
self.attextractdone = False
|
134 |
self.attextractdone = False
|
122 |
self.attachlist = []
|
135 |
self.attachlist = []
|
|
|
136 |
cf_attach = cf.getConfParam("pdfattach")
|
123 |
if cf_attach:
|
137 |
if cf_attach:
|
124 |
self.pdftk = rclexecm.which("pdftk")
|
138 |
self.pdftk = rclexecm.which("pdftk")
|
125 |
else:
|
|
|
126 |
self.pdftk = None
|
|
|
127 |
if self.pdftk:
|
139 |
if self.pdftk:
|
128 |
self.maybemaketmpdir()
|
140 |
self.maybemaketmpdir()
|
|
|
141 |
|
|
|
142 |
def _initextrameta(self):
    '''Prepare for the extraction of user-configured XMP metadata fields.

    Locates the pdfinfo command, turns the configuration string held
    in self.extrameta into a list of [xmltag, rcltag] pairs, imports
    lxml and compiles the regular expressions used later on.

    If pdfinfo or lxml can't be found, self.extrameta is reset to
    None, which disables the feature.
    '''
    self.pdfinfo = rclexecm.which("pdfinfo")
    if not self.pdfinfo:
        self.pdfinfo = rclexecm.which("poppler/pdfinfo")
    if not self.pdfinfo:
        self.extrameta = None
        return

    # extrameta is like "samename metanm|rclnm ..."
    # we turn it into a list of pairs
    pairs = []
    for entry in self.extrameta.split():
        names = entry.split('|')
        if len(names) == 1:
            # A single name is used both as XML tag and as Recoll field
            names.append(names[0])
        # Anything else than exactly two non-empty names would break
        # the (metanm, rclnm) unpacking done for every document, so
        # drop the bad entry instead of disabling the whole feature.
        if len(names) != 2 or not names[0] or not names[1]:
            self.em.rclog("Ignoring bad pdfextrameta entry: [%s]" % entry)
            continue
        pairs.append(names)
    self.extrameta = pairs

    # Using lxml because it is better with
    # namespaces. With xml, we'd have to walk the XML tree
    # first, extracting all xmlns attributes and
    # constructing a tree (I tried and did not succeed in
    # doing this actually). lxml does it partially for
    # us. See http://stackoverflow.com/questions/14853243/
    #    parsing-xml-with-namespace-in-python-via-elementtree
    global ET
    try:
        import lxml.etree as ET
    except Exception as err:
        self.em.rclog("Can't import lxml etree: %s" % err)
        self.extrameta = None
        self.pdfinfo = None
        return

    self.re_head = re.compile(r'<head>', re.IGNORECASE)
    # The "begin" part is matched non-greedily so that it stops at the
    # end of the xpacket begin instruction itself: a greedy match would
    # run up to the last "?>" found inside the packet and truncate the
    # captured XML.
    self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*?\?>' +
                                   r'(.*)' + r'<\?xpacket[ ]+end',
                                   flags=re.DOTALL)
|
|
|
181 |
|
130 |
# Extract all attachments if any into temporary directory
|
182 |
# Extract all attachments if any into temporary directory
|
131 |
def extractAttach(self):
|
183 |
def extractAttach(self):
|
132 |
if self.attextractdone:
|
184 |
if self.attextractdone:
|
133 |
return True
|
185 |
return True
|
134 |
self.attextractdone = True
|
186 |
self.attextractdone = True
|
|
... |
|
... |
242 |
for f in files:
|
294 |
for f in files:
|
243 |
data += open(f, "r").read()
|
295 |
data += open(f, "r").read()
|
244 |
|
296 |
|
245 |
if not data:
|
297 |
if not data:
|
246 |
return ""
|
298 |
return ""
|
|
|
299 |
return '''<html><head>
|
247 |
return '''<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>''' + \
|
300 |
<meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
|
|
|
301 |
</head><body><pre>''' + \
|
248 |
self.em.htmlescape(data) + \
|
302 |
self.em.htmlescape(data) + \
|
249 |
'''</pre></body></html>'''
|
303 |
'''</pre></body></html>'''
|
|
|
304 |
|
250 |
|
305 |
|
251 |
# pdftotext (used to?) badly escape text inside the header
|
306 |
# pdftotext (used to?) badly escape text inside the header
|
252 |
# fields. We do it here. This is not an html parser, and depends a
|
307 |
# fields. We do it here. This is not an html parser, and depends a
|
253 |
# lot on the actual format output by pdftotext.
|
308 |
# lot on the actual format output by pdftotext.
|
254 |
# We also determine if the doc has actual content, for triggering OCR
|
309 |
# We also determine if the doc has actual content, for triggering OCR
|
|
... |
|
... |
297 |
inbody = True
|
352 |
inbody = True
|
298 |
|
353 |
|
299 |
output += line + b'\n'
|
354 |
output += line + b'\n'
|
300 |
|
355 |
|
301 |
return output, isempty
|
356 |
return output, isempty
|
|
|
357 |
|
|
|
358 |
def _metatag(self, nm, val):
|
|
|
359 |
return "<meta name=\"" + nm + "\" content=\"" + \
|
|
|
360 |
self.em.htmlescape(val) + "\">"
|
|
|
361 |
|
|
|
362 |
# metaheaders is a list of (nm, value) pairs
|
|
|
363 |
def _injectmeta(self, html, metaheaders):
|
|
|
364 |
metatxt = ''
|
|
|
365 |
for nm, val in metaheaders:
|
|
|
366 |
metatxt += self._metatag(nm, val) + '\n'
|
|
|
367 |
if not metatxt:
|
|
|
368 |
return html
|
|
|
369 |
res = self.re_head.sub('<head>\n' + metatxt, html)
|
|
|
370 |
#self.em.rclog("Substituted html: [%s]"%res)
|
|
|
371 |
if res:
|
|
|
372 |
return res
|
|
|
373 |
else:
|
|
|
374 |
return html
|
|
|
375 |
|
|
|
376 |
def _xmltreetext(self, elt):
|
|
|
377 |
'''Extract all text content from subtree'''
|
|
|
378 |
text = ''
|
|
|
379 |
for e in elt.iter():
|
|
|
380 |
if e.text:
|
|
|
381 |
text += e.text + " "
|
|
|
382 |
return text.strip()
|
|
|
383 |
# or: return reduce((lambda t,p : t+p+' '),
|
|
|
384 |
# [e.text for e in elt.iter() if e.text]).strip()
|
302 |
|
385 |
|
|
|
386 |
def _setextrameta(self, html):
|
|
|
387 |
if not self.pdfinfo:
|
|
|
388 |
return
|
|
|
389 |
|
|
|
390 |
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
|
|
391 |
|
|
|
392 |
# Extract the XML packet
|
|
|
393 |
res = self.re_xmlpacket.search(all)
|
|
|
394 |
xml = ''
|
|
|
395 |
if res:
|
|
|
396 |
xml = res.group(1)
|
|
|
397 |
# self.em.rclog("extrameta: XML: [%s]" % xml)
|
|
|
398 |
if not xml:
|
|
|
399 |
return html
|
|
|
400 |
|
|
|
401 |
metaheaders = []
|
|
|
402 |
# The namespace thing is a drag. Can't do it from the top. See
|
|
|
403 |
# the stackoverflow ref above. Maybe we'd be better off just
|
|
|
404 |
# walking the full tree and building the namespaces dict.
|
|
|
405 |
root = ET.fromstring(xml)
|
|
|
406 |
#self.em.rclog("NSMAP: %s"% root.nsmap)
|
|
|
407 |
namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
|
|
|
408 |
rdf = root.find("rdf:RDF", namespaces)
|
|
|
409 |
#self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
|
|
|
410 |
rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
|
|
|
411 |
#self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)
|
|
|
412 |
for metanm,rclnm in self.extrameta:
|
|
|
413 |
for rdfdesc in rdfdesclist:
|
|
|
414 |
try:
|
|
|
415 |
elt = rdfdesc.find(metanm, rdfdesc.nsmap)
|
|
|
416 |
except:
|
|
|
417 |
# We get an exception when this rdf:Description does not
|
|
|
418 |
# define the required namespace.
|
|
|
419 |
continue
|
|
|
420 |
if elt is not None:
|
|
|
421 |
text = self._xmltreetext(elt)
|
|
|
422 |
if text:
|
|
|
423 |
# Should we set empty values ?
|
|
|
424 |
# Can't use setfield as it only works for
|
|
|
425 |
# text/plain output at the moment.
|
|
|
426 |
metaheaders.append((rclnm, text))
|
|
|
427 |
if metaheaders:
|
|
|
428 |
return self._injectmeta(html, metaheaders)
|
|
|
429 |
|
303 |
def _selfdoc(self):
    '''Extract the text from the pdf doc (as opposed to attachment)

    Runs pdftotext on self.filename, fixes up the HTML output, falls
    back to OCR when the document is empty and OCR is possible, then
    adds the extra metadata fields if any are configured.

    Returns a (True, html, "", eof) tuple, where eof tells the caller
    if more entries (attachments) may follow.
    '''
    self.em.setmimetype('text/html')

    if self.attextractdone and len(self.attachlist) == 0:
        eof = rclexecm.RclExecM.eofnext
    else:
        eof = rclexecm.RclExecM.noteof

    html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc",
                                    "UTF-8", "-eol", "unix", "-q",
                                    self.filename, "-"])

    html, isempty = self._fixhtml(html)
    #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))

    if isempty and self.ocrpossible:
        html = self.ocrpdf()

    if self.extrameta:
        try:
            withmeta = self._setextrameta(html)
        except Exception as err:
            # Metadata is a bonus: keep the extracted text on failure
            self.em.rclog("Metadata extraction failed: %s" % err)
        else:
            # Only replace the document if we actually got one back,
            # never lose the extracted text to an empty result.
            if withmeta:
                html = withmeta

    return (True, html, "", eof)
|
320 |
|
456 |
|
321 |
def maybemaketmpdir(self):
|
457 |
def maybemaketmpdir(self):
|
322 |
global tmpdir
|
458 |
global tmpdir
|
323 |
if tmpdir:
|
459 |
if tmpdir:
|
324 |
if not vacuumdir(tmpdir):
|
460 |
if not vacuumdir(tmpdir):
|
|
... |
|
... |
327 |
else:
|
463 |
else:
|
328 |
tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
|
464 |
tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
|
329 |
|
465 |
|
330 |
###### File type handler api, used by rclexecm ---------->
|
466 |
###### File type handler api, used by rclexecm ---------->
|
331 |
def openfile(self, params):
|
467 |
def openfile(self, params):
|
|
|
468 |
if not self.pdftotext:
|
|
|
469 |
print("RECFILTERROR HELPERNOTFOUND pdftotext")
|
|
|
470 |
sys.exit(1);
|
|
|
471 |
|
332 |
self.filename = params["filename:"]
|
472 |
self.filename = params["filename:"]
|
333 |
#self.em.rclog("openfile: [%s]" % self.filename)
|
473 |
#self.em.rclog("openfile: [%s]" % self.filename)
|
334 |
self.currentindex = -1
|
474 |
self.currentindex = -1
|
335 |
self.attextractdone = False
|
475 |
self.attextractdone = False
|
336 |
|
|
|
337 |
if not self.pdftotext:
|
|
|
338 |
print("RECFILTERROR HELPERNOTFOUND pdftotext")
|
|
|
339 |
sys.exit(1);
|
|
|
340 |
|
476 |
|
341 |
if self.pdftk:
|
477 |
if self.pdftk:
|
342 |
preview = os.environ.get("RECOLL_FILTER_FORPREVIEW", "no")
|
478 |
preview = os.environ.get("RECOLL_FILTER_FORPREVIEW", "no")
|
343 |
if preview != "yes":
|
479 |
if preview != "yes":
|
344 |
# When indexing, extract attachments at once. This
|
480 |
# When indexing, extract attachments at once. This
|