Switch to unified view

a/src/filters/rclpdf.py b/src/filters/rclpdf.py
...
...
89
            if not self.pdftotext:
89
            if not self.pdftotext:
90
                # No need for anything else. openfile() will return an
90
                # No need for anything else. openfile() will return an
91
                # error at once
91
                # error at once
92
                return
92
                return
93
93
94
        cf = rclconfig.RclConfig()
94
        self.config = rclconfig.RclConfig()
95
        self.confdir = cf.getConfDir()
95
        self.confdir = self.config.getConfDir()
96
97
        # The user can set a list of meta tags to be extracted from
96
        # The user can set a list of meta tags to be extracted from
98
        # the XMP metadata packet. These are specified as
97
        # the XMP metadata packet. These are specified as
99
        # (xmltag,rcltag) pairs
98
        # (xmltag,rcltag) pairs
100
        self.extrameta = cf.getConfParam("pdfextrameta")
99
        self.extrameta = self.config.getConfParam("pdfextrameta")
101
        if self.extrameta:
100
        if self.extrameta:
102
            self._initextrameta()
101
            self._initextrameta()
103
102
104
        # Check if we need to escape portions of text where old
103
        # Check if we need to escape portions of text where old
105
        # versions of pdftotext output raw HTML special characters.
104
        # versions of pdftotext output raw HTML special characters.
...
...
117
        
116
        
118
        # See if we'll try to perform OCR. Need the commands and
117
        # See if we'll try to perform OCR. Need the commands and
119
        # either the presence of a file in the config dir (historical)
118
        # either the presence of a file in the config dir (historical)
120
        # or a set config variable.
119
        # or a set config variable.
121
        self.ocrpossible = False
120
        self.ocrpossible = False
122
        cf_doocr = cf.getConfParam("pdfocr")
121
        cf_doocr = self.config.getConfParam("pdfocr")
123
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
122
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
124
            self.tesseract = rclexecm.which("tesseract")
123
            self.tesseract = rclexecm.which("tesseract")
125
            if self.tesseract:
124
            if self.tesseract:
126
                self.pdftoppm = rclexecm.which("pdftoppm")
125
                self.pdftoppm = rclexecm.which("pdftoppm")
127
                if self.pdftoppm:
126
                if self.pdftoppm:
...
...
132
        # Pdftk is optionally used to extract attachments. This takes
131
        # Pdftk is optionally used to extract attachments. This takes
133
        # a hit on performance even in the absence of any attachments,
132
        # a hit on performance even in the absence of any attachments,
134
        # so it can be disabled in the configuration.
133
        # so it can be disabled in the configuration.
135
        self.attextractdone = False
134
        self.attextractdone = False
136
        self.attachlist = []
135
        self.attachlist = []
137
        cf_attach = cf.getConfParam("pdfattach")
136
        cf_attach = self.config.getConfParam("pdfattach")
138
        if cf_attach:
137
        if cf_attach:
139
            self.pdftk = rclexecm.which("pdftk")
138
            self.pdftk = rclexecm.which("pdftk")
140
        if self.pdftk:
139
        if self.pdftk:
141
            self.maybemaketmpdir()
140
            self.maybemaketmpdir()
142
141
...
...
222
    # Try to guess tesseract language. This should depend on the input
221
    # Try to guess tesseract language. This should depend on the input
223
    # file, but we have no general way to determine it. So use the
222
    # file, but we have no general way to determine it. So use the
224
    # environment and hope for the best.
223
    # environment and hope for the best.
225
    def guesstesseractlang(self):
224
    def guesstesseractlang(self):
226
        tesseractlang = ""
225
        tesseractlang = ""
226
227
        # First look for a language def file in the file's directory 
227
        pdflangfile = os.path.join(os.path.dirname(self.filename), ".ocrpdflang")
228
        pdflangfile = os.path.join(os.path.dirname(self.filename),
229
                                   b".ocrpdflang")
228
        if os.path.isfile(pdflangfile):
230
        if os.path.isfile(pdflangfile):
229
            tesseractlang = open(pdflangfile, "r").read().strip()
231
            tesseractlang = open(pdflangfile, "r").read().strip()
230
        if tesseractlang:
232
        if tesseractlang:
231
            return tesseractlang
233
            return tesseractlang
232
234
235
        # Then look for a global option. The normal way now that we
236
        # have config reading capability in the handlers is to use the
237
        # config. Then, for backwards compat, environment variable and
238
        # file inside the configuration directory
239
        tesseractlang = self.config.getConfParam("pdfocrlang")
240
        if tesseractlang:
241
            return tesseractlang
233
        tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
242
        tesseractlang = os.environ.get("RECOLL_TESSERACT_LANG", "");
234
        if tesseractlang:
243
        if tesseractlang:
235
            return tesseractlang
244
            return tesseractlang
236
        
245
        pdflangfile = os.path.join(self.confdir, b"ocrpdf")
237
        tesseractlang = \
246
        if os.path.isfile(pdflangfile):
238
                      open(os.path.join(self.confdir, "ocrpdf"), "r").read().strip()
247
            tesseractlang = open(pdflangfile, "r").read().strip()
239
        if tesseractlang:
248
        if tesseractlang:
240
            return tesseractlang
249
            return tesseractlang
241
250
242
        # Half-assed attempt to guess from LANG, then default to English
251
        # Half-assed attempt to guess from LANG, then default to English
243
        localelang = os.environ.get("LANG", "").split("_")[0]
252
        localelang = os.environ.get("LANG", "").split("_")[0]
...
...
283
                                               tesseractlang],
292
                                               tesseractlang],
284
                                              stderr = subprocess.STDOUT)
293
                                              stderr = subprocess.STDOUT)
285
            except Exception as e:
294
            except Exception as e:
286
                self.em.rclog("tesseract failed: %s" % e)
295
                self.em.rclog("tesseract failed: %s" % e)
287
296
288
            errlines = out.split('\n')
297
            errlines = out.split(b'\n')
289
            if len(errlines) > 2:
298
            if len(errlines) > 2:
290
                self.em.rclog("Tesseract error: %s" % out)
299
                self.em.rclog("Tesseract error: %s" % out)
291
300
292
        # Concatenate the result files
301
        # Concatenate the result files
293
        files = glob.glob(tmpfile + "*" + ".txt")
302
        files = glob.glob(tmpfile + "*" + ".txt")