--- a
+++ b/src/filters/rcl7z
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+
+# 7-Zip file filter for Recoll
+
+# Thanks to Recoll user Martin Ziegler
+# This is a modified version of rclzip, with some help from rcltar
+# Python pylzma library required. See http://www.joachim-bauch.de/projects/pylzma/
+
+
+import os
+import fnmatch
+import rclexecm
+
+try:
+ import pylzma
+ from py7zlib import Archive7z
+except:
+ print "RECFILTERROR HELPERNOTFOUND python:pylzma"
+ sys.exit(1);
+
+try:
+ from recoll import rclconfig
+ hasrclconfig = True
+except:
+ hasrclconfig = False
+# As a temporary measure, we also look for rclconfig as a bare
+# module. This is so that the intermediate releases of the filter can
+# ship and use rclconfig.py with the filter code
+if not hasrclconfig:
+ try:
+ import rclconfig
+ hasrclconfig = True
+ except:
+ pass
+
+class SevenZipExtractor:
+ def __init__(self, em):
+ self.currentindex = 0
+ self.em = em
+
+ def extractone(self, ipath):
+ #self.em.rclog("extractone: [%s]" % ipath)
+ docdata = ""
+ try:
+ docdata = self.sevenzip.getmember(ipath).read()
+ ok = True
+ except Exception, err:
+ self.em.rclog("extractone: failed: [%s]" % err)
+ ok = False
+ iseof = rclexecm.RclExecM.noteof
+ if self.currentindex >= len(self.sevenzip.getnames()) -1:
+ iseof = rclexecm.RclExecM.eofnext
+ if isinstance(ipath, unicode):
+ ipath = ipath.encode("utf-8")
+ return (ok, docdata, ipath, iseof)
+
+ ###### File type handler api, used by rclexecm ---------->
+ def openfile(self, params):
+ filename = params["filename:"]
+ self.currentindex = -1
+ self.skiplist = []
+
+ if hasrclconfig:
+ config = rclconfig.RclConfig()
+ config.setKeyDir(os.path.dirname(filename))
+ skipped = config.getConfParam("zipSkippedNames")
+ if skipped is not None:
+ self.skiplist = skipped.split(" ")
+
+ try:
+ fp = open(filename, 'rb')
+ self.sevenzip = Archive7z(fp)
+ return True
+ except Exception, err:
+ self.em.rclog("openfile: failed: [%s]" % err)
+ return False
+
+ def getipath(self, params):
+ ipath = params["ipath:"]
+ ok, data, ipath, eof = self.extractone(ipath)
+ if ok:
+ return (ok, data, ipath, eof)
+ # Not found. Maybe we need to decode the path?
+ try:
+ ipath = ipath.decode("utf-8")
+ return self.extractone(ipath)
+ except Exception, err:
+ return (ok, data, ipath, eof)
+
+ def getnext(self, params):
+ if self.currentindex == -1:
+ # Return "self" doc
+ self.currentindex = 0
+ self.em.setmimetype('text/plain')
+ if len(self.sevenzip.getnames()) == 0:
+ eof = rclexecm.RclExecM.eofnext
+ else:
+ eof = rclexecm.RclExecM.noteof
+ return (True, "", "", eof)
+
+ if self.currentindex >= len(self.sevenzip.getnames()):
+ #self.em.rclog("getnext: EOF hit")
+ return (False, "", "", rclexecm.RclExecM.eofnow)
+ else:
+ entryname = self.sevenzip.getnames()[self.currentindex]
+
+ if hasrclconfig and len(self.skiplist) != 0:
+ while self.currentindex < len(self.sevenzip.getnames()):
+ entryname = self.sevenzip.getnames()[self.currentindex]
+ for pat in self.skiplist:
+ if fnmatch.fnmatch(entryname, pat):
+ entryname = None
+ break
+ if entryname is not None:
+ break
+ self.currentindex += 1
+ if entryname is None:
+ return (False, "", "", rclexecm.RclExecM.eofnow)
+
+ ret= self.extractone(entryname)
+ self.currentindex += 1
+ return ret
+
+# Main program: create protocol handler and extractor and run them
+proto = rclexecm.RclExecM()
+extract = SevenZipExtractor(proto)
+rclexecm.main(proto, extract)