Switch to unified view

a/src/filters/rclpdf.py b/src/filters/rclpdf.py
...
...
96
        # The user can set a list of meta tags to be extracted from
96
        # The user can set a list of meta tags to be extracted from
97
        # the XMP metadata packet. These are specified as
97
        # the XMP metadata packet. These are specified as
98
        # (xmltag,rcltag) pairs
98
        # (xmltag,rcltag) pairs
99
        self.extrameta = cf.getConfParam("pdfextrameta")
99
        self.extrameta = cf.getConfParam("pdfextrameta")
100
        if self.extrameta:
100
        if self.extrameta:
101
            self.extrametafix = cf.getConfParam("pdfextrametafix")
101
            self._initextrameta()
102
            self._initextrameta()
102
103
103
        # Check if we need to escape portions of text where old
104
        # Check if we need to escape portions of text where old
104
        # versions of pdftotext output raw HTML special characters.
105
        # versions of pdftotext output raw HTML special characters.
105
        self.needescape = True
106
        self.needescape = True
...
...
176
177
177
        self.re_head = re.compile(r'<head>', re.IGNORECASE)
178
        self.re_head = re.compile(r'<head>', re.IGNORECASE)
178
        self.re_xmlpacket = re.compile(r'<\?xpacket[    ]+begin.*\?>' +
179
        self.re_xmlpacket = re.compile(r'<\?xpacket[    ]+begin.*\?>' +
179
                                       r'(.*)' + r'<\?xpacket[  ]+end',
180
                                       r'(.*)' + r'<\?xpacket[  ]+end',
180
                                       flags = re.DOTALL)
181
                                       flags = re.DOTALL)
181
182
        global EMF
183
        EMF = None
184
        if self.extrametafix:
185
            try:
186
                import imp
187
                EMF = imp.load_source('pdfextrametafix', self.extrametafix)
188
            except Exception as err:
189
                self.em.rclog("Import extrametafix failed: %s" % err)
190
                pass
191
                
182
    # Extract all attachments if any into temporary directory
192
    # Extract all attachments if any into temporary directory
183
    def extractAttach(self):
193
    def extractAttach(self):
184
        if self.attextractdone:
194
        if self.attextractdone:
185
            return True
195
            return True
186
        self.attextractdone = True
196
        self.attextractdone = True
...
...
382
        return text.strip()
392
        return text.strip()
383
        # or: return reduce((lambda t,p : t+p+' '),
393
        # or: return reduce((lambda t,p : t+p+' '),
384
        #       [e.text for e in elt.iter() if e.text]).strip()
394
        #       [e.text for e in elt.iter() if e.text]).strip()
385
395
386
396
387
    # This can be used for local field editing. For now you need to
388
    # change the program source. Maybe we'll make it more dynamic one
389
    # day. The method receives an (original) field name, and the text
390
    # value, and should return the possibly modified text.
391
    def _extrametafix(self, nm, txt):
392
        if nm == 'bibtex:pages':
393
            txt = re.sub(r'--', '-', txt)
394
        elif nm == 'someothername':
395
            # do something else
396
            pass
397
        elif nm == 'stillanother':
398
            # etc.
399
            pass
400
        
401
        return txt
402
403
404
    def _setextrameta(self, html):
397
    def _setextrameta(self, html):
405
        if not self.pdfinfo:
398
        if not self.pdfinfo:
406
            return html
399
            return html
400
401
        emf = EMF.MetaFixer() if EMF else None
407
402
408
        all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
403
        all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
409
404
410
        # Extract the XML packet
405
        # Extract the XML packet
411
        res = self.re_xmlpacket.search(all)
406
        res = self.re_xmlpacket.search(all)
...
...
443
                    # We get an exception when this rdf:Description does not
438
                    # We get an exception when this rdf:Description does not
444
                    # define the required namespace.
439
                    # define the required namespace.
445
                    continue
440
                    continue
446
                if elt is not None:
441
                if elt is not None:
447
                    text = self._xmltreetext(elt).encode('UTF-8')
442
                    text = self._xmltreetext(elt).encode('UTF-8')
443
                    if emf:
444
                        text = emf.metafix(metanm, text)
448
                    # Should we set empty values ?
445
                    # Should we set empty values ?
449
                    if text:
446
                    if text:
450
                        text = self._extrametafix(metanm, text)
451
                        # Can't use setfield as it only works for
447
                        # Can't use setfield as it only works for
452
                        # text/plain output at the moment.
448
                        # text/plain output at the moment.
453
                        metaheaders.append((rclnm, text))
449
                        metaheaders.append((rclnm, text))
454
        if metaheaders:
450
        if metaheaders:
455
            return self._injectmeta(html, metaheaders)
451
            return self._injectmeta(html, metaheaders)