Parent: [dfe00a] (diff)

Child: [d115bc] (diff)

Download this file

rclmpdf.py    257 lines (220 with data), 8.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
#!/usr/bin/env python
# Copyright (C) 2014 J.F.Dockes
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the
# Free Software Foundation, Inc.,
# 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# Recoll PDF extractor, with support for attachments
from __future__ import print_function
import os
import sys
import re
import rclexecm
import subprocess
import tempfile
import atexit
import signal
tmpdir = None
def finalcleanup():
    """Remove the temporary attachment directory at interpreter exit.

    Registered with atexit. Does nothing if no temporary directory was
    ever created. Cleanup is best-effort: a failure to empty or remove
    the directory is reported on stderr instead of raising from inside
    the exit handler.
    """
    if not tmpdir:
        return
    try:
        # vacuumdir() only unlinks regular files, so rmdir() can still
        # fail if something else (e.g. a sub-directory) was left behind.
        vacuumdir(tmpdir)
        os.rmdir(tmpdir)
    except OSError as e:
        sys.stderr.write("rclmpdf: could not remove %s: %s\n" % (tmpdir, e))
def signal_handler(signal, frame):
    """Signal callback: terminate the process with exit status 1.

    Both arguments (signal number and current stack frame) are ignored.
    Exiting through SystemExit, rather than being killed by the signal,
    lets the interpreter run its registered exit handlers.
    """
    raise SystemExit(1)
# Remove the temporary directory when the interpreter exits.
atexit.register(finalcleanup)

# Convert the usual termination signals into a regular exit, so that the
# atexit cleanup above gets a chance to run. Installation is best-effort:
# some signals do not exist on every platform (AttributeError), and
# signal.signal() can refuse the request (ValueError / OSError).
for _signame in ("SIGHUP", "SIGINT", "SIGQUIT", "SIGTERM"):
    try:
        signal.signal(getattr(signal, _signame), signal_handler)
    except (AttributeError, ValueError, OSError):
        pass
def vacuumdir(dir):
    """Delete every regular file found directly inside directory `dir`.

    Sub-directories and other non-regular entries are left alone, and the
    directory itself is kept. An empty or None `dir` is a no-op.
    Always returns True; errors from the os calls propagate as exceptions.
    """
    if not dir:
        return True
    for entry in os.listdir(dir):
        candidate = os.path.join(dir, entry)
        if not os.path.isfile(candidate):
            continue
        os.unlink(candidate)
    return True
class PDFExtractor:
    """Recoll handler extracting the text and the attachments of a PDF file.

    The main document is produced by running pdftotext and fixing up its
    HTML output. Attachments, if any, are unpacked by pdftk into the
    module-level temporary directory and returned one at a time as
    sub-documents whose ipath is the attachment file name.

    openfile(), getipath() and getnext() form the file type handler API
    used by rclexecm.
    """

    # Header lines whose middle group holds a value needing HTML escaping.
    # The 'br' prefix (rather than 'rb') is also accepted by Python 2.
    _TITLE_RE = re.compile(br'(.*<title>)(.*)(<\/title>.*)')
    _CONTENT_RE = re.compile(br'(.*content=")(.*)(".*/>.*)')
    # Charset declaration inserted once at the top of the HTML header.
    _CHARSET_META = (b'<meta http-equiv="Content-Type" '
                     b'content="text/html; charset=UTF-8">\n')

    def __init__(self, em):
        """Store the protocol handler `em` and reset the per-file state.

        `em` is the object used for logging (rclog), MIME type setting
        (setmimetype) and HTML escaping (htmlescape).
        """
        self.currentindex = 0
        self.pdftotext = ""
        self.pdftk = ""
        self.em = em
        self.attextractdone = False
        self.attachlist = []

    def extractAttach(self):
        """Extract all attachments, if any, into the temporary directory.

        Runs at most once per opened file (guarded by attextractdone). On
        success, attachlist holds the sorted names of the extracted files;
        on failure the error is logged and attachlist is left empty.

        Always returns True: PDF attachments are not considered important
        enough to fail the whole document.
        """
        if self.attextractdone:
            return True
        self.attextractdone = True

        if not tmpdir or not self.pdftk:
            # no big deal
            return True

        try:
            vacuumdir(tmpdir)
            subprocess.check_call([self.pdftk, self.filename, "unpack_files",
                                   "output", tmpdir])
            self.attachlist = sorted(os.listdir(tmpdir))
        except Exception as e:
            # Do not keep a partial or stale list around.
            self.attachlist = []
            self.em.rclog("extractAttach: failed: %s" % e)
        # Return true anyway, pdf attachments are no big deal
        return True

    def extractone(self, ipath):
        """Return the attachment named `ipath` as (ok, data, ipath, eof).

        Attachments are extracted first if this was not done yet. If the
        attachment cannot be found, (False, "", "", eofnow) is returned.
        Otherwise `data` is the raw file content and `eof` is eofnext when
        currentindex designates the last entry of attachlist, else noteof.
        """
        failure = (False, "", "", rclexecm.RclExecM.eofnow)
        if not self.attextractdone:
            if not self.extractAttach():
                return failure
        if not tmpdir:
            # No pdftk, hence no temporary directory and no attachments.
            return failure
        # NOTE(review): ipath is joined as-is; presumably it is always a
        # name previously produced by os.listdir(tmpdir) -- confirm that
        # callers cannot pass an absolute or '..' path here.
        path = os.path.join(tmpdir, ipath)
        if not os.path.isfile(path):
            self.em.rclog("extractone: no such attachment: [%s]" % ipath)
            return failure
        # Attachments can be arbitrary binary data: read them as bytes,
        # like the main document data returned by _selfdoc().
        with open(path, "rb") as f:
            docdata = f.read()
        if self.currentindex == len(self.attachlist) - 1:
            eof = rclexecm.RclExecM.eofnext
        else:
            eof = rclexecm.RclExecM.noteof
        return (True, docdata, ipath, eof)

    def _fixhtml(self, input):
        """Fix up the HTML (bytes) output by pdftotext and return it.

        pdftotext (used to?) badly escape text inside the header
        fields. We do it here. This is not an html parser, and depends a
        lot on the actual format output by pdftotext.

        Processing is line based:
        - inside <head>: a charset <meta> line is inserted once, the
          <title> text or the content="..." value is HTML-escaped, and
          name="Subject" becomes name="Description";
        - inside <pre>: the whole line is HTML-escaped;
        - other lines are copied unchanged.
        Every output line is terminated by a newline.
        """
        inheader = False
        inbody = False
        didcs = False
        # Collect the pieces and join once, instead of repeated bytes +=
        chunks = []
        for line in input.split(b'\n'):
            # Closing tags are tested before processing, opening tags
            # after it, so that the tag lines themselves are not altered.
            if b'</head>' in line:
                inheader = False
            if b'</pre>' in line:
                inbody = False
            if inheader:
                if not didcs:
                    chunks.append(self._CHARSET_META)
                    didcs = True
                m = self._TITLE_RE.search(line) or \
                    self._CONTENT_RE.search(line)
                if m:
                    line = m.group(1) + self.em.htmlescape(m.group(2)) + \
                           m.group(3)
                # Recoll treats "Subject" as a "title" element
                # (based on emails). The PDF "Subject" metadata
                # field is more like an HTML "description"
                line = line.replace(b'name="Subject"',
                                    b'name="Description"', 1)
            elif inbody:
                # End-of-line hyphenation is deliberately left alone: it
                # is not clear that we should remove it, as pdftotext
                # without the -layout option may already do it.
                line = self.em.htmlescape(line)
            if b'<head>' in line:
                inheader = True
            if b'<pre>' in line:
                inbody = True
            chunks.append(line + b'\n')
        return b''.join(chunks)

    def _selfdoc(self):
        """Return the main document as (True, html_bytes, "", eof).

        Sets the MIME type to text/html and runs pdftotext on the current
        file. `eof` is eofnext when attachments were already looked for
        and none exists, else noteof. An exception from
        subprocess.check_output propagates to the caller.
        """
        self.em.setmimetype('text/html')
        if self.attextractdone and not self.attachlist:
            eof = rclexecm.RclExecM.eofnext
        else:
            eof = rclexecm.RclExecM.noteof
        data = subprocess.check_output([self.pdftotext, "-htmlmeta", "-enc",
                                        "UTF-8", "-eol", "unix", "-q",
                                        self.filename, "-"])
        data = self._fixhtml(data)
        return (True, data, "", eof)

    ###### File type handler api, used by rclexecm ---------->
    def openfile(self, params):
        """Start processing the file named by params["filename:"].

        Locates the pdftotext (mandatory) and pdftk (optional) helpers on
        first use, prepares the temporary directory, and, unless running
        for preview, extracts the attachments at once.

        Returns True on success and False if the temporary directory
        could not be emptied. Exits the process with status 1 if
        pdftotext cannot be found.
        """
        global tmpdir

        self.filename = params["filename:"]
        self.currentindex = -1
        self.attextractdone = False
        # Forget the attachments of the previously opened file.
        self.attachlist = []

        if not self.pdftotext:
            self.pdftotext = rclexecm.which("pdftotext") or \
                             rclexecm.which("poppler/pdftotext")
            if not self.pdftotext:
                print("RECFILTERROR HELPERNOTFOUND pdftotext")
                sys.exit(1)

        if not self.pdftk:
            self.pdftk = rclexecm.which("pdftk")

        if not self.pdftk:
            # No attachment support: pretend that extraction is done.
            self.attextractdone = True
            return True

        if tmpdir:
            if not vacuumdir(tmpdir):
                self.em.rclog("openfile: vacuumdir %s failed" % tmpdir)
                return False
        else:
            tmpdir = tempfile.mkdtemp(prefix='rclmpdf')

        preview = os.environ.get("RECOLL_FILTER_FORPREVIEW", "no")
        if preview != "yes":
            # When indexing, extract attachments at once. This
            # will be needed anyway and it allows generating an
            # eofnext error instead of waiting for actual eof,
            # which avoids a bug in recollindex up to 1.20
            self.extractAttach()
        return True

    def getipath(self, params):
        """Return the attachment designated by params["ipath:"].

        The result is the (ok, data, ipath, eof) tuple from extractone().
        """
        ipath = params["ipath:"]
        ok, data, ipath, eof = self.extractone(ipath)
        return (ok, data, ipath, eof)

    def getnext(self, params):
        """Return the next document of the current file.

        The first call after openfile() returns the main document; the
        following ones return the attachments, in attachlist order. When
        nothing is left, or on error, (False, "", "", eofnow) is returned.
        `params` is not used.
        """
        if self.currentindex == -1:
            self.currentindex = 0
            return self._selfdoc()

        self.em.setmimetype('')
        nomore = (False, "", "", rclexecm.RclExecM.eofnow)

        if not self.attextractdone:
            if not self.extractAttach():
                return nomore

        if self.currentindex >= len(self.attachlist):
            return nomore
        try:
            ok, data, ipath, eof = \
                self.extractone(self.attachlist[self.currentindex])
        except Exception:
            # E.g. unreadable attachment file: stop the iteration here.
            return nomore
        self.currentindex += 1
        return (ok, data, ipath, eof)
# Main program: create protocol handler and extractor and run them.
# The extractor keeps a reference to the protocol handler, which it uses
# for logging, MIME type setting and HTML escaping; rclexecm.main() is
# then given both objects.
proto = rclexecm.RclExecM()
extract = PDFExtractor(proto)
rclexecm.main(proto, extract)