|
a/src/filters/rclpdf.py |
|
b/src/filters/rclpdf.py |
|
... |
|
... |
96 |
# The user can set a list of meta tags to be extracted from
|
96 |
# The user can set a list of meta tags to be extracted from
|
97 |
# the XMP metadata packet. These are specified as
|
97 |
# the XMP metadata packet. These are specified as
|
98 |
# (xmltag,rcltag) pairs
|
98 |
# (xmltag,rcltag) pairs
|
99 |
self.extrameta = cf.getConfParam("pdfextrameta")
|
99 |
self.extrameta = cf.getConfParam("pdfextrameta")
|
100 |
if self.extrameta:
|
100 |
if self.extrameta:
|
|
|
101 |
self.extrametafix = cf.getConfParam("pdfextrametafix")
|
101 |
self._initextrameta()
|
102 |
self._initextrameta()
|
102 |
|
103 |
|
103 |
# Check if we need to escape portions of text where old
|
104 |
# Check if we need to escape portions of text where old
|
104 |
# versions of pdftotext output raw HTML special characters.
|
105 |
# versions of pdftotext output raw HTML special characters.
|
105 |
self.needescape = True
|
106 |
self.needescape = True
|
|
... |
|
... |
176 |
|
177 |
|
177 |
self.re_head = re.compile(r'<head>', re.IGNORECASE)
|
178 |
self.re_head = re.compile(r'<head>', re.IGNORECASE)
|
178 |
self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' +
|
179 |
self.re_xmlpacket = re.compile(r'<\?xpacket[ ]+begin.*\?>' +
|
179 |
r'(.*)' + r'<\?xpacket[ ]+end',
|
180 |
r'(.*)' + r'<\?xpacket[ ]+end',
|
180 |
flags = re.DOTALL)
|
181 |
flags = re.DOTALL)
|
181 |
|
182 |
global EMF
|
|
|
183 |
EMF = None
|
|
|
184 |
if self.extrametafix:
|
|
|
185 |
try:
|
|
|
186 |
import imp
|
|
|
187 |
EMF = imp.load_source('pdfextrametafix', self.extrametafix)
|
|
|
188 |
except Exception as err:
|
|
|
189 |
self.em.rclog("Import extrametafix failed: %s" % err)
|
|
|
190 |
pass
|
|
|
191 |
|
182 |
# Extract all attachments if any into temporary directory
|
192 |
# Extract all attachments if any into temporary directory
|
183 |
def extractAttach(self):
|
193 |
def extractAttach(self):
|
184 |
if self.attextractdone:
|
194 |
if self.attextractdone:
|
185 |
return True
|
195 |
return True
|
186 |
self.attextractdone = True
|
196 |
self.attextractdone = True
|
|
... |
|
... |
382 |
return text.strip()
|
392 |
return text.strip()
|
383 |
# or: return reduce((lambda t,p : t+p+' '),
|
393 |
# or: return reduce((lambda t,p : t+p+' '),
|
384 |
# [e.text for e in elt.iter() if e.text]).strip()
|
394 |
# [e.text for e in elt.iter() if e.text]).strip()
|
385 |
|
395 |
|
386 |
|
396 |
|
387 |
# This can be used for local field editing. For now you need to
|
|
|
388 |
# change the program source. Maybe we'll make it more dynamic one
|
|
|
389 |
# day. The method receives an (original) field name, and the text
|
|
|
390 |
# value, and should return the possibly modified text.
|
|
|
391 |
def _extrametafix(self, nm, txt):
|
|
|
392 |
if nm == 'bibtex:pages':
|
|
|
393 |
txt = re.sub(r'--', '-', txt)
|
|
|
394 |
elif nm == 'someothername':
|
|
|
395 |
# do something else
|
|
|
396 |
pass
|
|
|
397 |
elif nm == 'stillanother':
|
|
|
398 |
# etc.
|
|
|
399 |
pass
|
|
|
400 |
|
|
|
401 |
return txt
|
|
|
402 |
|
|
|
403 |
|
|
|
404 |
def _setextrameta(self, html):
|
397 |
def _setextrameta(self, html):
|
405 |
if not self.pdfinfo:
|
398 |
if not self.pdfinfo:
|
406 |
return html
|
399 |
return html
|
|
|
400 |
|
|
|
401 |
emf = EMF.MetaFixer() if EMF else None
|
407 |
|
402 |
|
408 |
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
403 |
all = subprocess.check_output([self.pdfinfo, "-meta", self.filename])
|
409 |
|
404 |
|
410 |
# Extract the XML packet
|
405 |
# Extract the XML packet
|
411 |
res = self.re_xmlpacket.search(all)
|
406 |
res = self.re_xmlpacket.search(all)
|
|
... |
|
... |
443 |
# We get an exception when this rdf:Description does not
|
438 |
# We get an exception when this rdf:Description does not
|
444 |
# define the required namespace.
|
439 |
# define the required namespace.
|
445 |
continue
|
440 |
continue
|
446 |
if elt is not None:
|
441 |
if elt is not None:
|
447 |
text = self._xmltreetext(elt).encode('UTF-8')
|
442 |
text = self._xmltreetext(elt).encode('UTF-8')
|
|
|
443 |
if emf:
|
|
|
444 |
text = emf.metafix(metanm, text)
|
448 |
# Should we set empty values ?
|
445 |
# Should we set empty values ?
|
449 |
if text:
|
446 |
if text:
|
450 |
text = self._extrametafix(metanm, text)
|
|
|
451 |
# Can't use setfield as it only works for
|
447 |
# Can't use setfield as it only works for
|
452 |
# text/plain output at the moment.
|
448 |
# text/plain output at the moment.
|
453 |
metaheaders.append((rclnm, text))
|
449 |
metaheaders.append((rclnm, text))
|
454 |
if metaheaders:
|
450 |
if metaheaders:
|
455 |
return self._injectmeta(html, metaheaders)
|
451 |
return self._injectmeta(html, metaheaders)
|