|
a/src/filters/rclpdf.py |
|
b/src/filters/rclpdf.py |
|
... |
|
... |
89 |
if not self.pdftotext:
|
89 |
if not self.pdftotext:
|
90 |
# No need for anything else. openfile() will return an
|
90 |
# No need for anything else. openfile() will return an
|
91 |
# error at once
|
91 |
# error at once
|
92 |
return
|
92 |
return
|
93 |
|
93 |
|
94 |
cf = rclconfig.RclConfig()
|
94 |
self.config = rclconfig.RclConfig()
|
95 |
self.confdir = cf.getConfDir()
|
95 |
self.confdir = self.config.getConfDir()
|
96 |
|
|
|
97 |
# The user can set a list of meta tags to be extracted from
|
96 |
# The user can set a list of meta tags to be extracted from
|
98 |
# the XMP metadata packet. These are specified as
|
97 |
# the XMP metadata packet. These are specified as
|
99 |
# (xmltag,rcltag) pairs
|
98 |
# (xmltag,rcltag) pairs
|
100 |
self.extrameta = cf.getConfParam("pdfextrameta")
|
99 |
self.extrameta = self.config.getConfParam("pdfextrameta")
|
101 |
if self.extrameta:
|
100 |
if self.extrameta:
|
102 |
self._initextrameta()
|
101 |
self._initextrameta()
|
103 |
|
102 |
|
104 |
# Check if we need to escape portions of text where old
|
103 |
# Check if we need to escape portions of text where old
|
105 |
# versions of pdftotext output raw HTML special characters.
|
104 |
# versions of pdftotext output raw HTML special characters.
|
|
... |
|
... |
117 |
|
116 |
|
118 |
# See if we'll try to perform OCR. Need the commands and
|
117 |
# See if we'll try to perform OCR. Need the commands and
|
119 |
# either the presence of a file in the config dir (historical)
|
118 |
# either the presence of a file in the config dir (historical)
|
120 |
# or a set config variable.
|
119 |
# or a set config variable.
|
121 |
self.ocrpossible = False
|
120 |
self.ocrpossible = False
|
122 |
cf_doocr = cf.getConfParam("pdfocr")
|
121 |
cf_doocr = self.config.getConfParam("pdfocr")
|
123 |
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
122 |
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
|
124 |
self.tesseract = rclexecm.which("tesseract")
|
123 |
self.tesseract = rclexecm.which("tesseract")
|
125 |
if self.tesseract:
|
124 |
if self.tesseract:
|
126 |
self.pdftoppm = rclexecm.which("pdftoppm")
|
125 |
self.pdftoppm = rclexecm.which("pdftoppm")
|
127 |
if self.pdftoppm:
|
126 |
if self.pdftoppm:
|
|
... |
|
... |
132 |
# Pdftk is optionally used to extract attachments. This takes
|
131 |
# Pdftk is optionally used to extract attachments. This takes
|
133 |
# a hit on performance even in the absence of any attachments,
|
132 |
# a hit on performance even in the absence of any attachments,
|
134 |
# so it can be disabled in the configuration.
|
133 |
# so it can be disabled in the configuration.
|
135 |
self.attextractdone = False
|
134 |
self.attextractdone = False
|
136 |
self.attachlist = []
|
135 |
self.attachlist = []
|
137 |
cf_attach = cf.getConfParam("pdfattach")
|
136 |
cf_attach = self.config.getConfParam("pdfattach")
|
138 |
if cf_attach:
|
137 |
if cf_attach:
|
139 |
self.pdftk = rclexecm.which("pdftk")
|
138 |
self.pdftk = rclexecm.which("pdftk")
|
140 |
if self.pdftk:
|
139 |
if self.pdftk:
|
141 |
self.maybemaketmpdir()
|
140 |
self.maybemaketmpdir()
|
142 |
|
141 |
|
|
... |
|
... |
222 |
# Try to guess tesseract language. This should depend on the input
|
221 |
# Try to guess tesseract language. This should depend on the input
|
223 |
# file, but we have no general way to determine it. So use the
|
222 |
# file, but we have no general way to determine it. So use the
|
224 |
# environment and hope for the best.
|
223 |
# environment and hope for the best.
|
225 |
def guesstesseractlang(self):
|
224 |
def guesstesseractlang(self):
|
226 |
tesseractlang = ""
|
225 |
tesseractlang = ""
|
|
|
226 |
|
|
|
227 |
# First look for a language def file in the file's directory
|
227 |
pdflangfile = os.path.join(os.path.dirname(self.filename), ".ocrpdflang")
|
228 |
pdflangfile = os.path.join(os.path.dirname(self.filename),
|
|
|
229 |
b".ocrpdflang")
|
228 |
if os.path.isfile(pdflangfile):
|
230 |
if os.path.isfile(pdflangfile):
|
229 |
tesseractlang = open(pdflangfile, "r").read().strip()
|
231 |
tesseractlang = open(pdflangfile, "r").read().strip()
|
230 |
if tesseractlang:
|
232 |
if tesseractlang:
|
231 |
return tesseractlang
|
233 |
return tesseractlang
|
232 |
|
234 |
|
|
|
235 |
# Then look for a global option. The normal way now that we
|
|
|
236 |
# have config reading capability in the handlers is to use the
|
|
|
237 |
# config. Then, for backwards compat, environment variable and
|
|
|
238 |
# file inside the configuration directory
|
|
|
239 |
tesseractlang = self.config.getConfParam("pdfocrlang")
|
|
|
240 |
if tesseractlang:
|
|
|
241 |
return tesseractlang
|
233 |
tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
|
242 |
tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
|
234 |
if tesseractlang:
|
243 |
if tesseractlang:
|
235 |
return tesseractlang
|
244 |
return tesseractlang
|
236 |
|
245 |
pdflangfile = os.path.join(self.confdir, b"ocrpdf")
|
237 |
tesseractlang = \
|
246 |
if os.path.isfile(pdflangfile):
|
238 |
open(os.path.join(self.confdir, "ocrpdf"), "r").read().strip()
|
247 |
tesseractlang = open(pdflangfile, "r").read().strip()
|
239 |
if tesseractlang:
|
248 |
if tesseractlang:
|
240 |
return tesseractlang
|
249 |
return tesseractlang
|
241 |
|
250 |
|
242 |
# Half-assed trial to guess from LANG then default to english
|
251 |
# Half-assed trial to guess from LANG then default to english
|
243 |
localelang = os.environ.get("LANG", "").split("_")[0]
|
252 |
localelang = os.environ.get("LANG", "").split("_")[0]
|
|
... |
|
... |
283 |
tesseractlang],
|
292 |
tesseractlang],
|
284 |
stderr = subprocess.STDOUT)
|
293 |
stderr = subprocess.STDOUT)
|
285 |
except Exception as e:
|
294 |
except Exception as e:
|
286 |
self.em.rclog("tesseract failed: %s" % e)
|
295 |
self.em.rclog("tesseract failed: %s" % e)
|
287 |
|
296 |
|
288 |
errlines = out.split('\n')
|
297 |
errlines = out.split(b'\n')
|
289 |
if len(errlines) > 2:
|
298 |
if len(errlines) > 2:
|
290 |
self.em.rclog("Tesseract error: %s" % out)
|
299 |
self.em.rclog("Tesseract error: %s" % out)
|
291 |
|
300 |
|
292 |
# Concatenate the result files
|
301 |
# Concatenate the result files
|
293 |
files = glob.glob(tmpfile + "*" + ".txt")
|
302 |
files = glob.glob(tmpfile + "*" + ".txt")
|