--- a/src/filters/rclpdf.py
+++ b/src/filters/rclpdf.py
@@ -91,13 +91,12 @@
# error at once
return
- cf = rclconfig.RclConfig()
- self.confdir = cf.getConfDir()
-
+ self.config = rclconfig.RclConfig()
+ self.confdir = self.config.getConfDir()
# The user can set a list of meta tags to be extracted from
# the XMP metadata packet. These are specified as
# (xmltag,rcltag) pairs
- self.extrameta = cf.getConfParam("pdfextrameta")
+ self.extrameta = self.config.getConfParam("pdfextrameta")
if self.extrameta:
self._initextrameta()
@@ -119,7 +118,7 @@
# either the presence of a file in the config dir (historical)
# or a set config variable.
self.ocrpossible = False
- cf_doocr = cf.getConfParam("pdfocr")
+ cf_doocr = self.config.getConfParam("pdfocr")
if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
self.tesseract = rclexecm.which("tesseract")
if self.tesseract:
@@ -134,7 +133,7 @@
# so it can be disabled in the configuration.
self.attextractdone = False
self.attachlist = []
- cf_attach = cf.getConfParam("pdfattach")
+ cf_attach = self.config.getConfParam("pdfattach")
if cf_attach:
self.pdftk = rclexecm.which("pdftk")
if self.pdftk:
@@ -224,18 +223,28 @@
# environment and hope for the best.
def guesstesseractlang(self):
tesseractlang = ""
- pdflangfile = os.path.join(os.path.dirname(self.filename), ".ocrpdflang")
+
+ # First look for a language def file in the file's directory
+ pdflangfile = os.path.join(os.path.dirname(self.filename),
+ b".ocrpdflang")
if os.path.isfile(pdflangfile):
tesseractlang = open(pdflangfile, "r").read().strip()
if tesseractlang:
return tesseractlang
+ # Then look for a global option. Now that the handlers can read
+ # the configuration, the preferred source is the "pdfocrlang"
+ # parameter. For backward compatibility, fall back to the
+ # environment variable, then to a file in the configuration directory.
+ tesseractlang = self.config.getConfParam("pdfocrlang")
+ if tesseractlang:
+ return tesseractlang
tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
if tesseractlang:
return tesseractlang
-
- tesseractlang = \
- open(os.path.join(self.confdir, "ocrpdf"), "r").read().strip()
+ pdflangfile = os.path.join(self.confdir, b"ocrpdf")
+ if os.path.isfile(pdflangfile):
+ tesseractlang = open(pdflangfile, "r").read().strip()
if tesseractlang:
return tesseractlang
@@ -285,7 +294,7 @@
except Exception as e:
self.em.rclog("tesseract failed: %s" % e)
- errlines = out.split('\n')
+ errlines = out.split(b'\n')
if len(errlines) > 2:
self.em.rclog("Tesseract error: %s" % out)