Parent:
[dfe00a]
(diff)
Download this file
rclwar
62 lines (51 with data), 1.8 kB
#!/usr/bin/env python2
# WAR web archive filter for recoll. War file are gzipped tar files
from __future__ import print_function
import rclexecm
import tarfile
class WarExtractor:
def __init__(self, em):
self.em = em
def extractone(self, tarinfo):
docdata = ""
try:
member = self.tar.extractfile(tarinfo)
docdata = member.read()
ok = True
except Exception as err:
self.em.rclog("extractone: failed: [%s]" % err)
ok = False
return (ok, docdata, tarinfo.name, rclexecm.RclExecM.noteof)
###### File type handler api, used by rclexecm ---------->
def openfile(self, params):
self.currentindex = -1
try:
self.tar = tarfile.open(params["filename:"])
return True
except Exception as err:
self.em.rclog(str(err))
return False
def getipath(self, params):
ipath = params["ipath:"]
try:
tarinfo = self.tar.getmember(ipath)
except Exception as err:
self.em.rclog(str(err))
return (False, "", ipath, rclexecm.RclExecM.noteof)
return self.extractone(tarinfo)
def getnext(self, params):
if self.currentindex == -1:
# Return "self" doc
self.currentindex = 0
return (True, "", "", rclexecm.RclExecM.noteof)
tarinfo = self.tar.next()
if tarinfo is None:
#self.em.rclog("getnext: EOF hit")
return (False, "", "", rclexecm.RclExecM.eofnow)
else:
ret = self.extractone(tarinfo)
return ret
# Main program: create protocol handler and extractor and run them
proto = rclexecm.RclExecM()
extract = WarExtractor(proto)
rclexecm.main(proto, extract)