recoll / Code / Diff of /src/filters/rclmpdf.py

Diff of /src/filters/rclmpdf.py [d688b2] .. [683a25]

Switch to side-by-side view

--- a/src/filters/rclmpdf.py
+++ b/src/filters/rclmpdf.py
@@ -120,24 +120,24 @@
         inheader = False
         inbody = False
         didcs = False
-        output = ''
-        cont = ''
-        for line in input.split('\n'):
+        output = b''
+        cont = b''
+        for line in input.split(b'\n'):
             line = cont + line
-            cont = ''
-            if re.search('</head>', line):
+            cont = b''
+            if re.search(b'</head>', line):
                 inheader = False
-            if re.search('</pre>', line):
+            if re.search(b'</pre>', line):
                 inbody = False
             if inheader:
                 if not didcs:
-                    output += '<meta http-equiv="Content-Type"' + \
-                              'content="text/html; charset=UTF-8">\n'
+                    output += b'<meta http-equiv="Content-Type"' + \
+                              b'content="text/html; charset=UTF-8">\n'
                     didcs = True
 
-                m = re.search(r'(.*<title>)(.*)(<\/title>.*)', line)
+                m = re.search(rb'(.*<title>)(.*)(<\/title>.*)', line)
                 if not m:
-                    m = re.search(r'(.*content=")(.*)(".*/>.*)', line)
+                    m = re.search(rb'(.*content=")(.*)(".*/>.*)', line)
                 if m:
                     line = m.group(1) + self.em.htmlescape(m.group(2)) + \
                            m.group(3)
@@ -145,7 +145,7 @@
                 # Recoll treats "Subject" as a "title" element
                 # (based on emails). The PDF "Subject" metadata
                 # field is more like an HTML "description"
-                line = re.sub('name="Subject"', 'name="Description"', line, 1)
+                line = re.sub(b'name="Subject"', b'name="Description"', line, 1)
 
             elif inbody:
                 # Remove end-of-line hyphenation. It's not clear that
@@ -158,12 +158,12 @@
                         #cont = m.group(2).rstrip('-')
                 line = self.em.htmlescape(line)
                 
-            if re.search('<head>', line):
+            if re.search(b'<head>', line):
                 inheader = True
-            if re.search('<pre>', line):
+            if re.search(b'<pre>', line):
                 inbody = True
 
-            output += line + '\n'
+            output += line + b'\n'
 
         return output