recoll / Code / Diff of /src/filters/rclzip

Diff of /src/filters/rclzip [7eb182] .. [205fdd]

Switch to side-by-side view

--- a/src/filters/rclzip
+++ b/src/filters/rclzip
@@ -5,13 +5,41 @@
 import rclexecm
 from zipfile import ZipFile
 
+# Note about file names (Python 2.6, 2.7; don't know about 3.)
+#
+# There is a bit in zip entries to indicate if the filename is encoded
+# as utf-8 or not. If the bit is set, zipfile decodes the file name
+# and stores it in the catalog as a unicode object. Else it uses a
+# binary string.
+#
+# When reading the file, the input file name is used directly as an
+# index into the catalog.
+#
+# When we send the file name data to the indexer, we have to serialize
+# it as a byte string: we can't pass unicode objects to and fro. This
+# means that we have to test if the name is unicode. If it is, we send
+# the string encoded as utf-8. When reading, if the input is utf-8, we
+# turn it to unicode and use this to access the zip member, else we
+# use the binary string.
+# 
+# In the case where an archive member name is a valid non-ascii utf-8
+# string, but the flag is not set (which could probably happen if the
+# archiver did not try to detect utf-8 file names), this will fail,
+# because we'll convert back the utf-8 string to unicode and pass this
+# to zipfile, but a utf-8 string, not a unicode object, is actually in
+# the catalog in this case, so the access will fail (will be seen at
+# preview or open time). This does not affect ascii file names because
+# the representation is the same anyway.
+#
+# To avoid this problem, we'd need to pass a separate bit of
+# information indicating that encoding was performed, not just rely on
+# the utf-8 validity test (i.e. have a first-char switch), but this would be
+# incompatible with existing indexes. Instead we try both ways...
+#
 class ZipExtractor:
     def __init__(self, em):
         self.currentindex = 0
         self.em = em
-
-    def extractzipentry(self, name):
-        return (ret, data)
 
     def extractone(self, ipath):
         #self.em.rclog("extractone: [%s]" % ipath)
@@ -20,11 +48,13 @@
             docdata = self.zip.read(ipath)
             ok = True
         except Exception, err:
-            self.em.rclog("extractone: failed: [%s]" % err)
+#            self.em.rclog("extractone: failed: [%s]" % err)
             ok = False
         iseof = rclexecm.RclExecM.noteof
         if self.currentindex >= len(self.zip.namelist()) -1:
             iseof = rclexecm.RclExecM.eofnext
+        if isinstance(ipath, unicode):
+            ipath = ipath.encode("utf-8")
         return (ok, docdata, ipath, iseof)
 
     ###### File type handler api, used by rclexecm ---------->
@@ -37,7 +67,16 @@
             return False
 
     def getipath(self, params):
-        return self.extractone(params["ipath:"])
+        ipath = params["ipath:"]
+        ok, data, ipath, eof = self.extractone(ipath)
+        if ok:
+            return (ok, data, ipath, eof)
+        # Not found. Maybe we need to decode the path?
+        try:
+            ipath = ipath.decode("utf-8")
+            return self.extractone(ipath)
+        except Exception, err:
+            return (ok, data, ipath, eof)
         
     def getnext(self, params):
         if self.currentindex >= len(self.zip.namelist()):