Switch to unified view

a/src/filters/rclpdf.py b/src/filters/rclpdf.py
...
...
86
        
86
        
87
        self.pdftotext = rclexecm.which("pdftotext")
87
        self.pdftotext = rclexecm.which("pdftotext")
88
        if not self.pdftotext:
88
        if not self.pdftotext:
89
            self.pdftotext = rclexecm.which("poppler/pdftotext")
89
            self.pdftotext = rclexecm.which("poppler/pdftotext")
90
90
91
        # Check if we need to escape portions of text where old
92
        # versions of pdftotext output raw HTML special characters.
93
        self.needescape = True
94
        try:
95
            version = subprocess.check_output([self.pdftotext, "-v"],
96
                                              stderr=subprocess.STDOUT)
97
            major,minor,rev = version.split()[2].split('.')
98
            # Don't know exactly when this changed but it's fixed in
99
            # jessie 0.26.5
100
            if int(major) > 0 or int(minor) >= 26:
101
                self.needescape = False
102
        except:
103
            pass
104
        
91
        # See if we'll try to perform OCR. Need the commands and the
105
        # See if we'll try to perform OCR. Need the commands and the
92
        # either the presence of a file in the config dir (historical)
106
        # either the presence of a file in the config dir (historical)
93
        # or a set config variable.
107
        # or a set config variable.
94
        self.ocrpossible = False
108
        self.ocrpossible = False
95
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
109
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
...
...
253
            if inheader:
267
            if inheader:
254
                if not didcs:
268
                if not didcs:
255
                    output += b'<meta http-equiv="Content-Type"' + \
269
                    output += b'<meta http-equiv="Content-Type"' + \
256
                              b'content="text/html; charset=UTF-8">\n'
270
                              b'content="text/html; charset=UTF-8">\n'
257
                    didcs = True
271
                    didcs = True
258
272
                if self.needescape:
259
                m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
273
                    m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
260
                if not m:
274
                    if not m:
261
                    m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
275
                        m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
262
                if m:
276
                    if m:
263
                    line = m.group(1) + self.em.htmlescape(m.group(2)) + \
277
                        line = m.group(1) + self.em.htmlescape(m.group(2)) + \
264
                           m.group(3)
278
                               m.group(3)
265
279
266
                # Recoll treats "Subject" as a "title" element
280
                # Recoll treats "Subject" as a "title" element
267
                # (based on emails). The PDF "Subject" metadata
281
                # (based on emails). The PDF "Subject" metadata
268
                # field is more like an HTML "description"
282
                # field is more like an HTML "description"
269
                line = re.sub(b'name="Subject"', b'name="Description"', line, 1)
283
                line = re.sub(b'name="Subject"', b'name="Description"', line, 1)