recoll / Code / Diff of /src/filters/rclpdf.py

Diff of /src/filters/rclpdf.py [fc537d] .. [b492d2]

Switch to unified view


...
        
        self.pdftotext = rclexecm.which("pdftotext")
        if not self.pdftotext:
            self.pdftotext = rclexecm.which("poppler/pdftotext")

        # Check if we need to escape portions of text where old
        # versions of pdftotext output raw HTML special characters.
        self.needescape = True
        try:
            version = subprocess.check_output([self.pdftotext, "-v"],
                                              stderr=subprocess.STDOUT)
            major,minor,rev = version.split()[2].split('.')
            # Don't know exactly when this changed but it's fixed in
            # jessie 0.26.5
            if int(major) > 0 or int(minor) >= 26:
                self.needescape = False
        except:
            pass
        
        # See if we'll try to perform OCR. Need the commands and
        # either the presence of a file in the config dir (historical)
        # or a set config variable.
        self.ocrpossible = False
        if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
...
            if inheader:
                if not didcs:
                    output += b'<meta http-equiv="Content-Type"' + \
                              b'content="text/html; charset=UTF-8">\n'
                    didcs = True
                if self.needescape:
                    m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
                    if not m:
                        m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
                    if m:
                        line = m.group(1) + self.em.htmlescape(m.group(2)) + \
                               m.group(3)

                # Recoll treats "Subject" as a "title" element
                # (based on emails). The PDF "Subject" metadata
                # field is more like an HTML "description"
                line = re.sub(b'name="Subject"', b'name="Description"', line, 1)

	a/src/filters/rclpdf.py		b/src/filters/rclpdf.py
	...		...
86		86
87	self.pdftotext = rclexecm.which("pdftotext")	87	self.pdftotext = rclexecm.which("pdftotext")
88	if not self.pdftotext:	88	if not self.pdftotext:
89	self.pdftotext = rclexecm.which("poppler/pdftotext")	89	self.pdftotext = rclexecm.which("poppler/pdftotext")
90		90
		91	# Check if we need to escape portions of text where old
		92	# versions of pdftotext output raw HTML special characters.
		93	self.needescape = True
		94	try:
		95	version = subprocess.check_output([self.pdftotext, "-v"],
		96	stderr=subprocess.STDOUT)
		97	major,minor,rev = version.split()[2].split('.')
		98	# Don't know exactly when this changed but it's fixed in
		99	# jessie 0.26.5
		100	if int(major) > 0 or int(minor) >= 26:
		101	self.needescape = False
		102	except:
		103	pass
		104
91	# See if we'll try to perform OCR. Need the commands and	105	# See if we'll try to perform OCR. Need the commands and
92	# either the presence of a file in the config dir (historical)	106	# either the presence of a file in the config dir (historical)
93	# or a set config variable.	107	# or a set config variable.
94	self.ocrpossible = False	108	self.ocrpossible = False
95	if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):	109	if cf_doocr or os.path.isfile(os.path.join(self.confdir, "ocrpdf")):
	...		...
253	if inheader:	267	if inheader:
254	if not didcs:	268	if not didcs:
255	output += b'<meta http-equiv="Content-Type"' + \	269	output += b'<meta http-equiv="Content-Type"' + \
256	b'content="text/html; charset=UTF-8">\n'	270	b'content="text/html; charset=UTF-8">\n'
257	didcs = True	271	didcs = True
258		272	if self.needescape:
259	m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)	273	m = re.search(b'''(.*<title>)(.*)(<\/title>.*)''', line)
260	if not m:	274	if not m:
261	m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)	275	m = re.search(b'''(.*content=")(.*)(".*/>.*)''', line)
262	if m:	276	if m:
263	line = m.group(1) + self.em.htmlescape(m.group(2)) + \	277	line = m.group(1) + self.em.htmlescape(m.group(2)) + \
264	m.group(3)	278	m.group(3)
265		279
266	# Recoll treats "Subject" as a "title" element	280	# Recoll treats "Subject" as a "title" element
267	# (based on emails). The PDF "Subject" metadata	281	# (based on emails). The PDF "Subject" metadata
268	# field is more like an HTML "description"	282	# field is more like an HTML "description"
269	line = re.sub(b'name="Subject"', b'name="Description"', line, 1)	283	line = re.sub(b'name="Subject"', b'name="Description"', line, 1)