a/src/filters/rclpdf.py b/src/filters/rclpdf.py
...
...
76
76
77
class PDFExtractor:
77
class PDFExtractor:
78
    def __init__(self, em):
78
    def __init__(self, em):
79
        self.currentindex = 0
79
        self.currentindex = 0
80
        self.pdftotext = None
80
        self.pdftotext = None
81
        self.pdfinfo = None
82
        self.pdftk = None
81
        self.em = em
83
        self.em = em
82
84
        self.tesseract = None
83
        self.confdir = rclconfig.RclConfig().getConfDir()
84
        cf_doocr = rclconfig.RclConfig().getConfParam("pdfocr")
85
        cf_attach = rclconfig.RclConfig().getConfParam("pdfattach")
86
        
85
        
87
        self.pdftotext = rclexecm.which("pdftotext")
86
        self.pdftotext = rclexecm.which("pdftotext")
88
        if not self.pdftotext:
87
        if not self.pdftotext:
89
            self.pdftotext = rclexecm.which("poppler/pdftotext")
88
            self.pdftotext = rclexecm.which("poppler/pdftotext")
89
            # No need for anything else. openfile() will return an
90
            # error at once
91
            return
92
93
        cf = rclconfig.RclConfig()
94
        self.confdir = cf.getConfDir()
95
96
        # The user can set a list of meta tags to be extracted from
97
        # the XMP metadata packet. These are specified as
98
        # (xmltag,rcltag) pairs
99
        self.extrameta = cf.getConfParam("pdfextrameta")
100
        if self.extrameta:
101
            self._initextrameta()
90
102
91
        # Check if we need to escape portions of text where old
103
        # Check if we need to escape portions of text where old
92
        # versions of pdftotext output raw HTML special characters.
104
        # versions of pdftotext output raw HTML special characters.
93
        self.needescape = True
105
        self.needescape = True
94
        try:
106
        try:
...
...
104
        
116
        
105
        # See if we'll try to perform OCR. Need the commands and the
117
        # See if we'll try to perform OCR. Need the commands and the
106
        # either the presence of a file in the config dir (historical)
118
        # either the presence of a file in the config dir (historical)
107
        # or a set config variable.
119
        # or a set config variable.
108
        self.ocrpossible = False
120
        self.ocrpossible = False
121
        cf_doocr = cf.getConfParam("pdfocr")
109
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
122
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
110
            self.tesseract = rclexecm.which("tesseract")
123
            self.tesseract = rclexecm.which("tesseract")
111
            if self.tesseract:
124
            if self.tesseract:
112
                self.pdftoppm = rclexecm.which("pdftoppm")
125
                self.pdftoppm = rclexecm.which("pdftoppm")
113
                if self.pdftoppm:
126
                if self.pdftoppm:
114
                    self.ocrpossible = True
127
                    self.ocrpossible = True
115
                    self.maybemaketmpdir()
128
                    self.maybemaketmpdir()
116
        # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
129
        # self.em.rclog("OCRPOSSIBLE: %d" % self.ocrpossible)
117
130
118
        # Pdftk is optionally used to extract attachments. This takes
131
        # Pdftk is optionally used to extract attachments. This takes
119
        # a hit on perfmance even in the absence of any attachments,
132
        # a hit on performance even in the absence of any attachments,
120
        # so it can be disabled in the configuration.
133
        # so it can be disabled in the configuration.
121
        self.attextractdone = False
134
        self.attextractdone = False
122
        self.attachlist = []
135
        self.attachlist = []
136
        cf_attach = cf.getConfParam("pdfattach")
123
        if cf_attach:
137
        if cf_attach:
124
            self.pdftk = rclexecm.which("pdftk")
138
            self.pdftk = rclexecm.which("pdftk")
125
        else:
126
            self.pdftk = None
127
        if self.pdftk:
139
        if self.pdftk:
128
            self.maybemaketmpdir()
140
            self.maybemaketmpdir()
141
142
    def _initextrameta(self):
        '''Set things up for extracting extra fields from XMP metadata.

        Locates the pdfinfo command, converts the configuration string
        held in self.extrameta into a list of [xmltag, rcltag] pairs,
        imports lxml and compiles the regular expressions stored in
        self.re_head and self.re_xmlpacket. If pdfinfo or lxml is not
        available, self.extrameta is reset to None.
        '''
        self.pdfinfo = (rclexecm.which("pdfinfo") or
                        rclexecm.which("poppler/pdfinfo"))
        if not self.pdfinfo:
            self.extrameta = None
            return

        # The configured value looks like "samename metanm|rclnm ...".
        # Every blank-separated word becomes one pair; a word without
        # a '|' uses the same name on both sides.
        pairs = []
        for spec in self.extrameta.split():
            names = spec.split('|')
            if len(names) == 1:
                names = names * 2
            pairs.append(names)
        self.extrameta = pairs

        # lxml is used rather than xml.etree because it copes better
        # with namespaces: with xml.etree we would have to walk the
        # tree first and collect all the xmlns attributes ourselves.
        # See http://stackoverflow.com/questions/14853243/
        #    parsing-xml-with-namespace-in-python-via-elementtree
        # The module is bound to a global name so that the other
        # methods can use it.
        global ET
        try:
            import lxml.etree as ET
        except Exception as err:
            self.em.rclog("Can't import lxml etree: %s" % err)
            self.extrameta = None
            self.pdfinfo = None
            return

        # NOTE(review): the bracketed classes below contain only blanks
        # as seen here; the author may have meant space + tab - confirm.
        packet_begin = r'<\?xpacket[  ]+begin.*\?>'
        packet_body = r'(.*)'
        packet_end = r'<\?xpacket[    ]+end'
        self.re_head = re.compile(r'<head>', re.IGNORECASE)
        self.re_xmlpacket = re.compile(packet_begin + packet_body + packet_end,
                                       flags=re.DOTALL)
181
130
    # Extract all attachments if any into temporary directory
182
    # Extract all attachments if any into temporary directory
131
    def extractAttach(self):
183
    def extractAttach(self):
132
        if self.attextractdone:
184
        if self.attextractdone:
133
            return True
185
            return True
134
        self.attextractdone = True
186
        self.attextractdone = True
...
...
242
        for f in files:
294
        for f in files:
243
            data += open(f, "r").read()
295
            data += open(f, "r").read()
244
296
245
        if not data:
297
        if not data:
246
            return ""
298
            return ""
299
        return '''<html><head>
247
        return '''<html><head><meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\"></head><body><pre>''' + \
300
        <meta http-equiv=\"Content-Type\" content=\"text/html; charset=UTF-8\">
301
        </head><body><pre>''' + \
248
        self.em.htmlescape(data) + \
302
        self.em.htmlescape(data) + \
249
        '''</pre></body></html>'''
303
        '''</pre></body></html>'''
304
250
305
251
    # pdftotext (used to?) badly escape text inside the header
306
    # pdftotext (used to?) badly escape text inside the header
252
    # fields. We do it here. This is not an html parser, and depends a
307
    # fields. We do it here. This is not an html parser, and depends a
253
    # lot on the actual format output by pdftotext.
308
    # lot on the actual format output by pdftotext.
254
    # We also determine if the doc has actual content, for triggering OCR
309
    # We also determine if the doc has actual content, for triggering OCR
...
...
297
                inbody = True
352
                inbody = True
298
353
299
            output += line + b'\n'
354
            output += line + b'\n'
300
355
301
        return output, isempty
356
        return output, isempty
357
358
    def _metatag(self, nm, val):
359
        return "<meta name=\"" + nm + "\" content=\"" + \
360
               self.em.htmlescape(val) + "\">"
361
362
    # metaheaders is a list of (nm, value) pairs
363
    def _injectmeta(self, html, metaheaders):
364
        metatxt = ''
365
        for nm, val in metaheaders:
366
            metatxt += self._metatag(nm, val) + '\n'
367
        if not metatxt:
368
            return html
369
        res = self.re_head.sub('<head>\n' + metatxt, html)
370
        #self.em.rclog("Substituted html: [%s]"%res)
371
        if res:
372
            return res
373
        else:
374
            return html
375
    
376
    def _xmltreetext(self, elt):
377
        '''Extract all text content from subtree'''
378
        text = ''
379
        for e in elt.iter():
380
            if e.text:
381
                text += e.text + " "
382
        return text.strip()
383
        # or: return reduce((lambda t,p : t+p+' '),
384
        #       [e.text for e in elt.iter() if e.text]).strip()
302
            
385
        
386
    def _setextrameta(self, html):
387
        if not self.pdfinfo:
388
            return
389
390
        all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
391
392
        # Extract the XML packet
393
        res = self.re_xmlpacket.search(all)
394
        xml = ''
395
        if res:
396
            xml = res.group(1)
397
        # self.em.rclog("extrameta: XML: [%s]" % xml)
398
        if not xml:
399
            return html
400
401
        metaheaders = []
402
        # The namespace thing is a drag. Can't do it from the top. See
403
        # the stackoverflow ref above. Maybe we'd be better off just
404
        # walking the full tree and building the namespaces dict.
405
        root = ET.fromstring(xml)
406
        #self.em.rclog("NSMAP: %s"% root.nsmap)
407
        namespaces = {'rdf' : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"}
408
        rdf = root.find("rdf:RDF", namespaces)
409
        #self.em.rclog("RDF NSMAP: %s"% rdf.nsmap)
410
        rdfdesclist = rdf.findall("rdf:Description", rdf.nsmap)
411
        #self.em.rclog("RDFDESC NSMAP: %s"% rdfdesc.nsmap)
412
        for metanm,rclnm in self.extrameta:
413
            for rdfdesc in rdfdesclist:
414
                try:
415
                    elt = rdfdesc.find(metanm, rdfdesc.nsmap)
416
                except:
417
                    # We get an exception when this rdf:Description does not
418
                    # define the required namespace.
419
                    continue
420
                if elt is not None:
421
                    text = self._xmltreetext(elt)
422
                    if text:
423
                        # Should we set empty values ?
424
                        # Can't use setfield as it only works for
425
                        # text/plain output at the moment.
426
                        metaheaders.append((rclnm, text))
427
        if metaheaders:
428
            return self._injectmeta(html, metaheaders)
429
    
303
    def _selfdoc(self):
        '''Extract the text from the pdf doc (as opposed to attachment)

        Runs pdftotext on self.filename to produce HTML, fixes it up
        with self._fixhtml(), falls back to OCR when the document has
        no text and OCR is possible, then adds the extra metadata
        fields if any are configured.

        Returns a (True, html, "", eof) tuple. eof is
        RclExecM.eofnext when attachment extraction has already run
        and found nothing, else RclExecM.noteof.
        '''
        self.em.setmimetype('text/html')

        # NOTE(review): eofnext/noteof presumably tell rclexecm whether
        # further entries (attachments) may follow - confirm.
        if self.attextractdone and not self.attachlist:
            eof = rclexecm.RclExecM.eofnext
        else:
            eof = rclexecm.RclExecM.noteof

        html = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc",
                                        "UTF-8", "-eol", "unix", "-q",
                                        self.filename, "-"])

        html, isempty = self._fixhtml(html)
        #self.em.rclog("ISEMPTY: %d : data: \n%s" % (isempty, html))

        if isempty and self.ocrpossible:
            html = self.ocrpdf()

        if self.extrameta:
            try:
                withmeta = self._setextrameta(html)
            except Exception as err:
                # Metadata is a bonus: keep the text we already have.
                self.em.rclog("Metadata extraction failed: %s" % err)
            else:
                # Never replace the document with an empty result when
                # there was nothing to inject.
                if withmeta is not None:
                    html = withmeta

        return (True, html, "", eof)
320
456
321
    def maybemaketmpdir(self):
457
    def maybemaketmpdir(self):
322
        global tmpdir
458
        global tmpdir
323
        if tmpdir:
459
        if tmpdir:
324
            if not vacuumdir(tmpdir):
460
            if not vacuumdir(tmpdir):
...
...
327
        else:
463
        else:
328
            tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
464
            tmpdir = tempfile.mkdtemp(prefix='rclmpdf')
329
        
465
        
330
    ###### File type handler api, used by rclexecm ---------->
466
    ###### File type handler api, used by rclexecm ---------->
331
    def openfile(self, params):
467
    def openfile(self, params):
468
        if not self.pdftotext:
469
            print("RECFILTERROR HELPERNOTFOUND pdftotext")
470
            sys.exit(1);
471
332
        self.filename = params["filename:"]
472
        self.filename = params["filename:"]
333
        #self.em.rclog("openfile: [%s]" % self.filename)
473
        #self.em.rclog("openfile: [%s]" % self.filename)
334
        self.currentindex = -1
474
        self.currentindex = -1
335
        self.attextractdone = False
475
        self.attextractdone = False
336
337
        if not self.pdftotext:
338
            print("RECFILTERROR HELPERNOTFOUND pdftotext")
339
            sys.exit(1);
340
476
341
        if self.pdftk:
477
        if self.pdftk:
342
            preview = os.environ.get("RECOLL_FILTER_FORPREVIEW", "no")
478
            preview = os.environ.get("RECOLL_FILTER_FORPREVIEW", "no")
343
            if preview != "yes":
479
            if preview != "yes":
344
                # When indexing, extract attachments at once. This
480
                # When indexing, extract attachments at once. This