|
a/src/filters/rclpdf.py |
|
b/src/filters/rclpdf.py |
|
... |
|
... |
86 |
|
86 |
|
87 |
self.pdftotext = rclexecm.which("pdftotext")
|
87 |
self.pdftotext = rclexecm.which("pdftotext")
|
88 |
if not self.pdftotext:
|
88 |
if not self.pdftotext:
|
89 |
self.pdftotext = rclexecm.which("poppler/pdftotext")
|
89 |
self.pdftotext = rclexecm.which("poppler/pdftotext")
|
90 |
|
90 |
|
|
|
91 |
# Check if we need to escape portions of text where old
|
|
|
92 |
# versions of pdftotext output raw HTML special characters.
|
|
|
93 |
self.needescape = True
|
|
|
94 |
try:
|
|
|
95 |
version = subprocess.check_output([self.pdftotext, "-v"],
|
|
|
96 |
stderr=subprocess.STDOUT)
|
|
|
97 |
major,minor,rev = version.split()[2].split('.')
|
|
|
98 |
# Don't know exactly when this changed but it's fixed in
|
|
|
99 |
# jessie 0.26.5
|
|
|
100 |
if int(major) > 0 or int(minor) >= 26:
|
|
|
101 |
self.needescape = False
|
|
|
102 |
except:
|
|
|
103 |
pass
|
|
|
104 |
|
91 |
# See if we'll try to perform OCR. Need the commands and
|
105 |
# See if we'll try to perform OCR. Need the commands and
|
92 |
# either the presence of a file in the config dir (historical)
|
106 |
# either the presence of a file in the config dir (historical)
|
93 |
# or a set config variable.
|
107 |
# or a set config variable.
|
94 |
self.ocrpossible = False
|
108 |
self.ocrpossible = False
|
95 |
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
109 |
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
|
... |
|
... |
253 |
if inheader:
|
267 |
if inheader:
|
254 |
if not didcs:
|
268 |
if not didcs:
|
255 |
output += b'<meta http-equiv="Content-Type"' + \
|
269 |
output += b'<meta http-equiv="Content-Type"' + \
|
256 |
b'content="text/html; charset=UTF-8">\n'
|
270 |
b'content="text/html; charset=UTF-8">\n'
|
257 |
didcs = True
|
271 |
didcs = True
|
258 |
|
272 |
if self.needescape:
|
259 |
m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
|
273 |
m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
|
260 |
if not m:
|
274 |
if not m:
|
261 |
m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
|
275 |
m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
|
262 |
if m:
|
276 |
if m:
|
263 |
line = m.group(1) + self.em.htmlescape(m.group(2)) + \
|
277 |
line = m.group(1) + self.em.htmlescape(m.group(2)) + \
|
264 |
m.group(3)
|
278 |
m.group(3)
|
265 |
|
279 |
|
266 |
# Recoll treats "Subject" as a "title" element
|
280 |
# Recoll treats "Subject" as a "title" element
|
267 |
# (based on emails). The PDF "Subject" metadata
|
281 |
# (based on emails). The PDF "Subject" metadata
|
268 |
# field is more like an HTML "description"
|
282 |
# field is more like an HTML "description"
|
269 |
line = re.sub(b'name="Subject"', b'name="Description"', line, 1)
|
283 |
line = re.sub(b'name="Subject"', b'name="Description"', line, 1)
|