Switch to unified view

a/src/mediaserver/cdplugins/uprcl/uprclfolders.py b/src/mediaserver/cdplugins/uprcl/uprclfolders.py
...
...
44
# 
44
# 
45
# Entry 0 in _dirvec is special: it holds the 'topdirs' from the recoll
45
# Entry 0 in _dirvec is special: it holds the 'topdirs' from the recoll
46
# configuration. The entries are paths instead of simple names, and
46
# configuration. The entries are paths instead of simple names, and
47
# the docidx is -1 (no associated doc). The diridx points to a dirvec entry.
47
# the docidx is -1 (no associated doc). The diridx points to a dirvec entry.
48
48
49
50
import os
49
import os
51
import shlex
50
import shlex
52
import urllib
51
import urllib
53
import sys
52
import sys
54
import time
53
import time
...
...
58
from uprclutils import docarturi, audiomtypes, rcldirentry, \
57
from uprclutils import docarturi, audiomtypes, rcldirentry, \
59
     rcldoctoentry, cmpentries
58
     rcldoctoentry, cmpentries
60
from recoll import recoll
59
from recoll import recoll
61
from recoll import rclconfig
60
from recoll import rclconfig
62
61
63
_foldersIdPfx = '0$uprcl$folders'
62
class Folders(object):
64
63
64
    # Initialize (read recoll data and build tree).
65
    def __init__(self, confdir, httphp, pathprefix):
66
        self._idprefix = '0$uprcl$folders'
67
        self._httphp = httphp
68
        self._pprefix = pathprefix
65
# Debug : limit processed recoll entries for speed
69
        # Debug : limit processed recoll entries for speed
66
_maxrclcnt = 0
70
        self._maxrclcnt = 0
71
        self._fetchalldocs(confdir)
72
        self._rcl2folders(confdir)
67
73
68
_dirvec = []
74
    def rcldocs(self):
69
75
        return self._rcldocs
70
76
    
71
# Create new directory entry: insert in father and append dirvec slot
77
    # Create new directory entry: insert in father and append dirvec slot
72
# (with ".." entry)
78
    # (with ".." entry)
73
def _createdir(dirvec, fathidx, docidx, nm):
79
    def _createdir(self, fathidx, docidx, nm):
74
    dirvec.append({})
80
        self._dirvec.append({})
75
    dirvec[fathidx][nm] = (len(dirvec) - 1, docidx)
81
        self._dirvec[fathidx][nm] = (len(self._dirvec) - 1, docidx)
76
    dirvec[-1][".."] = (fathidx, -1)
82
        self._dirvec[-1][".."] = (fathidx, -1)
77
    return len(dirvec) - 1
83
        return len(self._dirvec) - 1
78
84
79
85
80
# Walk the recoll docs array and split the URLs paths to build the
86
    # Walk the recoll docs array and split the URLs paths to build the
81
# [folders] data structure
87
    # [folders] data structure
82
def _rcl2folders(docs, confdir, httphp, pathprefix):
88
    def _rcl2folders(self, confdir):
83
    global dirvec
84
    dirvec = []
89
        self._dirvec = []
85
    start = timer()
90
        start = timer()
86
91
87
    rclconf = rclconfig.RclConfig(confdir)
92
        rclconf = rclconfig.RclConfig(confdir)
88
    topdirs = [os.path.expanduser(d) for d in
93
        topdirs = [os.path.expanduser(d) for d in
89
               shlex.split(rclconf.getConfParam('topdirs'))]
94
                   shlex.split(rclconf.getConfParam('topdirs'))]
90
    topdirs = [d.rstrip('/') for d in topdirs]
95
        topdirs = [d.rstrip('/') for d in topdirs]
91
96
92
    # Create the 1st entry. This is special because it holds the
97
        # Create the 1st entry. This is special because it holds the
93
    # recoll topdirs, which are paths instead of simple names. There
98
        # recoll topdirs, which are paths instead of simple names. There
94
    # does not seem any need to build the tree between a topdir and /
99
        # does not seem any need to build the tree between a topdir and /
95
    dirvec.append({})
100
        self._dirvec.append({})
96
    dirvec[0][".."] = (0, -1)
101
        self._dirvec[0][".."] = (0, -1)
97
    for d in topdirs:
102
        for d in topdirs:
98
        dirvec.append({})
103
            self._dirvec.append({})
99
        dirvec[0][d] = (len(dirvec)-1, -1)
104
            self._dirvec[0][d] = (len(self._dirvec)-1, -1)
100
        dirvec[-1][".."] = (0, -1)
105
            self._dirvec[-1][".."] = (0, -1)
101
106
102
    # Walk the doc list and update the directory tree according to the
107
        # Walk the doc list and update the directory tree according to the
103
    # url: create intermediary directories if needed, create leaf
108
        # url: create intermediary directories if needed, create leaf
104
    # entry.
109
        # entry.
105
    for docidx in range(len(docs)):
110
        for docidx in range(len(self._rcldocs)):
106
        doc = docs[docidx]
111
            doc = self._rcldocs[docidx]
107
            
112
            
108
        # Possibly enrich the doc entry with a cover art uri.
113
            # Possibly enrich the doc entry with a cover art uri.
109
        arturi = docarturi(doc, httphp, pathprefix)
114
            arturi = docarturi(doc, self._httphp, self._pprefix)
110
        if arturi:
115
            if arturi:
111
            # The uri is quoted, so it's ascii and we can just store
116
                # The uri is quoted, so it's ascii and we can just store
112
            # it as a doc attribute
117
                # it as a doc attribute
113
            doc.albumarturi = arturi
118
                doc.albumarturi = arturi
114
119
115
        # No need to include non-audio types in the visible tree.
120
            # No need to include non-audio types in the visible tree.
116
        if doc.mtype not in audiomtypes:
121
            if doc.mtype not in audiomtypes:
117
            continue
122
                continue
118
123
119
        url = doc.getbinurl()
124
            url = doc.getbinurl()
120
        url = url[7:]
125
            url = url[7:]
126
            try:
127
                decoded = url.decode('utf-8')
128
            except:
129
                decoded = urllib.quote(url).decode('utf-8')
130
131
            # Determine the root entry (topdirs element). Special because
132
            # its path is not a simple name.
133
            fathidx = -1
134
            for rtpath,idx in self._dirvec[0].iteritems():
135
                if url.startswith(rtpath):
136
                    fathidx = idx[0]
137
                    break
138
            if fathidx == -1:
139
                uplog("No parent in topdirs: %s" % decoded)
140
                continue
141
142
            # Compute rest of path
143
            url1 = url[len(rtpath):]
144
            if len(url1) == 0:
145
                continue
146
147
            # If there is a contentgroup field, just add it as a virtual
148
            # directory in the path. This only affects the visible tree,
149
            # not the 'real' URLs of course.
150
            if doc.contentgroup:
151
                a = os.path.dirname(url1).decode('utf-8', errors='replace')
152
                b = os.path.basename(url1).decode('utf-8', errors='replace')
153
                url1 = os.path.join(a, doc.contentgroup, b)
154
            
155
            # Split path, then walk the vector, possibly creating
156
            # directory entries as needed
157
            path = url1.split('/')[1:]
158
            #uplog("%s"%path, file=sys.stderr)
159
            for idx in range(len(path)):
160
                elt = path[idx]
161
                if elt in self._dirvec[fathidx]:
162
                    # This path element was already seen
163
                    # If this is the last entry in the path, maybe update
164
                    # the doc idx (previous entries were created for
165
                    # intermediate elements without a Doc).
166
                    if idx == len(path) -1:
167
                        self._dirvec[fathidx][elt] = (self._dirvec[fathidx][elt][0], docidx)
168
                        #uplog("updating docidx for %s" % decoded)
169
                    # Update fathidx for next iteration
170
                    fathidx = self._dirvec[fathidx][elt][0]
171
                else:
172
                    # Element has no entry in father directory (hence no
173
                    # self._dirvec entry either).
174
                    if idx != len(path) -1:
175
                        # This is an intermediate element. Create a
176
                        # Doc-less directory
177
                        fathidx = self._createdir(fathidx, -1, elt)
178
                    else:
179
                        # Last element. If directory, needs a self._dirvec entry
180
                        if doc.mtype == 'inode/directory':
181
                            fathidx = self._createdir(fathidx, docidx, elt)
182
                            #uplog("Setting docidx for %s" % decoded)
183
                        else:
184
                            self._dirvec[fathidx][elt] = (-1, docidx)
185
186
        if False:
187
            for ent in self._dirvec:
188
                uplog("%s" % ent)
189
190
        end = timer()
191
        uplog("_rcl2folders took %.2f Seconds" % (end - start))
192
193
    # Fetch all the docs by querying Recoll with [mime:*], which is
194
    # guaranteed to match every doc without overflowing the query size
195
    # (because the number of mime types is limited). Something like
196
    # title:* would overflow. This creates the main doc array, which is
197
    # then used by all modules.
198
    def _fetchalldocs(self, confdir):
        """Load the result of a "mime:*" Recoll query into self._rcldocs.

        Results are pulled batch by batch with fetchmany(). The loop ends
        when a batch is not exactly rclq.arraysize long, or when the debug
        limit self._maxrclcnt (if > 0) has been reached.
        """
        t_begin = timer()

        rcldb = recoll.connect(confdir=confdir)
        rclq = rcldb.query()
        rclq.execute("mime:*", stemming=0)
        uplog("Estimated alldocs query results: %d" % (rclq.rowcount))

        self._rcldocs = []
        fetched = 0
        while True:
            batch = rclq.fetchmany()
            self._rcldocs.extend(batch)
            fetched = len(self._rcldocs)
            limit_reached = self._maxrclcnt > 0 and fetched >= self._maxrclcnt
            if limit_reached or len(batch) != rclq.arraysize:
                break
            # Zero-length sleep between batches, kept from the original code
            time.sleep(0)

        t_end = timer()
        uplog("Retrieved %d docs in %.2f Seconds" % (fetched,t_end - t_begin))
219
220
221
    ##############
222
    # Browsing the initialized [folders] hierarchy
223
224
    # Extract dirvec index from objid, according to the way we generate them.
225
    def _objidtodiridx(self, pid):
        """Return the self._dirvec index encoded in a folders container objid.

        pid must start with self._idprefix. A bare prefix designates entry 0.
        Otherwise the remainder must look like '$d<index>' (the form browse()
        generates for directories; items use '$i<index>').

        Raises Exception if the prefix does not match, if no docs were
        loaded, if the id does not designate a directory, or if the index is
        malformed, negative or beyond the end of self._dirvec.
        """
        if not pid.startswith(self._idprefix):
            raise Exception("folders.browse: bad pid %s" % pid)

        if len(self._rcldocs) == 0:
            raise Exception("folders:browse: no docs")

        suffix = pid[len(self._idprefix):]
        if not suffix:
            diridx = 0
        else:
            # Slice instead of indexing so that a truncated id (e.g. a lone
            # '$') is reported as a non-dir objid, not as an IndexError.
            if suffix[1:2] != 'd':
                raise Exception("folders:browse: called on non dir objid %s" %
                                pid)
            try:
                diridx = int(suffix[2:])
            except ValueError:
                raise Exception("folders:browse: bad pid %s" % pid)

        # A negative value would silently index _dirvec from the end.
        if diridx < 0 or diridx >= len(self._dirvec):
            raise Exception("folders:browse: bad pid %s" % pid)

        return diridx
245
246
247
    # Tell the top module what entries we define in the root
248
    def rootentries(self, pid):
        """Return the list of entries this module defines in the root.

        The list holds a single directory entry titled '[folders]', with
        id pid + 'folders' and parent pid.
        """
        folders_entry = rcldirentry(pid + 'folders', pid, '[folders]')
        return [folders_entry]
250
251
252
    # Look at all non-directory docs inside directory, and return the cover
253
    # art we find.
254
    def _arturifordir(self, diridx):
        """Return a cover art URI for directory diridx, or None.

        Scans the entries of self._dirvec[diridx] and returns the
        albumarturi of the first entry encountered that has a doc, is not
        an 'inode/directory' doc, and has a non-empty albumarturi.
        """
        for _name, ids in self._dirvec[diridx].iteritems():
            docidx = ids[1]
            if docidx < 0:
                # Entry without a doc (e.g. "..")
                continue
            doc = self._rcldocs[docidx]
            if doc.mtype == 'inode/directory':
                continue
            if doc.albumarturi:
                return doc.albumarturi
        return None
260
              
261
262
    # Folder hierarchy browse method.
263
    # objid is like folders$index
264
    # flag is meta or children.
265
    def browse(self, pid, flag):
        """Return the sorted entries of the directory designated by pid.

        pid is decoded with self._objidtodiridx(). Sub-directories become
        rcldirentry() entries with ids of the form <prefix>$d<diridx>;
        other docs become rcldoctoentry() entries with ids of the form
        <prefix>$i<docidx>. The flag argument is accepted but not used by
        this method.
        """
        diridx = self._objidtodiridx(pid)

        # With a single topdir, entry 0 holds only ".." and that topdir:
        # show the topdir contents directly. Entries 0 and 1 then designate
        # the same directory, which does not seem to be an issue.
        if diridx == 0 and len(self._dirvec[0]) == 2:
            diridx = 1

        entries = []

        # The basename call below only matters for diridx==0, where names
        # are topdirs paths. Remove it if it proves a performance issue.
        for name, ids in self._dirvec[diridx].iteritems():
            if name == "..":
                continue
            subdiridx, docidx = ids[0], ids[1]

            doc = None
            if docidx >= 0:
                doc = self._rcldocs[docidx]
            else:
                uplog("No doc for %s" % pid)

            if subdiridx < 0:
                # Not a directory. docidx had better be set
                if docidx == -1:
                    uplog("folders:docidx -1 for non-dir entry %s"%name)
                    continue
                doc = self._rcldocs[docidx]
                objid = self._idprefix + '$i' + str(docidx)
                entry = rcldoctoentry(objid, pid, self._httphp,
                                      self._pprefix, doc)
                if entry:
                    entries.append(entry)
                continue

            # Directory: skip it if it only holds its ".." entry
            if len(self._dirvec[subdiridx]) == 1:
                continue
            objid = self._idprefix + '$' + 'd' + str(subdiridx)
            # Prefer art from the directory's own doc, else search its entries
            if doc and doc.albumarturi:
                arturi = doc.albumarturi
            else:
                arturi = self._arturifordir(subdiridx)
            entries.append(rcldirentry(objid, pid, os.path.basename(name),
                                       arturi=arturi))

        return sorted(entries, cmp=cmpentries)
313
314
    # Return path for objid, which has to be a container. This is good old
315
    # pwd... It is called from the search module for generating a 'dir:'
316
    # recoll filtering directive.
317
    def dirpath(self, objid):
318
        # We may get called from search, on the top dir (above
319
        # [folders]). Return empty in this case
121
        try:
320
        try:
122
            decoded = url.decode('utf-8')
321
            diridx = self._objidtodiridx(objid)
123
        except:
322
        except:
124
            decoded = urllib.quote(url).decode('utf-8')
323
            return ""
125
324
126
        # Determine the root entry (topdirs element). Special because
127
        # its path is not a simple name.
128
        fathidx = -1
129
        for rtpath,idx in dirvec[0].iteritems():
130
            if url.startswith(rtpath):
131
                fathidx = idx[0]
132
                break
133
        if fathidx == -1:
325
        if diridx == 0:
134
            uplog("No parent in topdirs: %s" % decoded)
326
            return "/"
135
            continue
136
137
        # Compute rest of path
138
        url1 = url[len(rtpath):]
139
        if len(url1) == 0:
140
            continue
141
142
        # If there is a contentgroup field, just add it as a virtual
143
        # directory in the path. This only affects the visible tree,
144
        # not the 'real' URLs of course.
145
        if doc.contentgroup:
146
            a = os.path.dirname(url1).decode('utf-8', errors='replace')
147
            b = os.path.basename(url1).decode('utf-8', errors='replace')
148
            url1 = os.path.join(a, doc.contentgroup, b)
149
            
150
        # Split path, then walk the vector, possibly creating
151
        # directory entries as needed
152
        path = url1.split('/')[1:]
153
        #uplog("%s"%path, file=sys.stderr)
154
        for idx in range(len(path)):
155
            elt = path[idx]
156
            if elt in dirvec[fathidx]:
157
                # This path element was already seen
158
                # If this is the last entry in the path, maybe update
159
                # the doc idx (previous entries were created for
160
                # intermediate elements without a Doc).
161
                if idx == len(path) -1:
162
                    dirvec[fathidx][elt] = (dirvec[fathidx][elt][0], docidx)
163
                    #uplog("updating docidx for %s" % decoded)
164
                # Update fathidx for next iteration
165
                fathidx = dirvec[fathidx][elt][0]
166
            else:
167
                # Element has no entry in father directory (hence no
168
                # dirvec entry either).
169
                if idx != len(path) -1:
170
                    # This is an intermediate element. Create a
171
                    # Doc-less directory
172
                    fathidx = _createdir(dirvec, fathidx, -1, elt)
173
                else:
174
                    # Last element. If directory, needs a dirvec entry
175
                    if doc.mtype == 'inode/directory':
176
                        fathidx = _createdir(dirvec, fathidx, docidx, elt)
177
                        #uplog("Setting docidx for %s" % decoded)
178
                    else:
179
                        dirvec[fathidx][elt] = (-1, docidx)
180
181
    if False:
182
        for ent in dirvec:
183
            uplog("%s" % ent)
184
185
    end = timer()
186
    uplog("_rcl2folders took %.2f Seconds" % (end - start))
187
    return dirvec
188
189
# Fetch all the docs by querying Recoll with [mime:*], which is
190
# guaranteed to match every doc without overflowing the query size
191
# (because the number of mime types is limited). Something like
192
# title:* would overflow. This creates the main doc array, which is
193
# then used by all modules.
194
def _fetchalldocs(confdir):
    """Return the list of docs matched by a "mime:*" Recoll query.

    Results are pulled batch by batch with fetchmany(). The loop ends when
    a batch is not exactly rclq.arraysize long, or when the debug limit
    _maxrclcnt (if > 0) has been reached.
    """
    t_begin = timer()
    collected = []

    rcldb = recoll.connect(confdir=confdir)
    rclq = rcldb.query()
    rclq.execute("mime:*", stemming=0)
    uplog("Estimated alldocs query results: %d" % (rclq.rowcount))

    fetched = 0
    while True:
        batch = rclq.fetchmany()
        collected.extend(batch)
        fetched = len(collected)
        limit_reached = _maxrclcnt > 0 and fetched >= _maxrclcnt
        if limit_reached or len(batch) != rclq.arraysize:
            break
        # Zero-length sleep between batches, kept from the original code
        time.sleep(0)

    t_end = timer()
    uplog("Retrieved %d docs in %.2f Seconds" % (fetched,t_end - t_begin))
    return collected
216
217
218
# Initialize (read recoll data and build tree). This is called by
219
# uprcl-app init
220
def inittree(confdir, httphp, pathprefix):
221
    global g_alldocs, _dirvec
222
    
327
    
223
    g_alldocs = _fetchalldocs(confdir)
328
        lpath = []
224
    _dirvec = _rcl2folders(g_alldocs, confdir, httphp, pathprefix)
329
        while True:
225
    return g_alldocs
330
            fathidx = self._dirvec[diridx][".."][0]
226
227
228
229
##############
230
# Browsing the initialized [folders] hierarchy
231
232
233
# Extract dirvec index from objid, according to the way we generate them.
234
def _objidtodiridx(pid):
    """Return the _dirvec index encoded in a folders container objid.

    pid must start with _foldersIdPfx. A bare prefix designates entry 0.
    Otherwise the remainder must look like '$d<index>'.

    Raises Exception if the prefix does not match, if g_alldocs is empty,
    if the id does not designate a directory, or if the index is malformed,
    negative or beyond the end of _dirvec.
    """
    if not pid.startswith(_foldersIdPfx):
        raise Exception("folders.browse: bad pid %s" % pid)

    if len(g_alldocs) == 0:
        raise Exception("folders:browse: no docs")

    suffix = pid[len(_foldersIdPfx):]
    if not suffix:
        diridx = 0
    else:
        # Slice instead of indexing so that a truncated id (e.g. a lone '$')
        # is reported as a non-dir objid, not as an IndexError.
        if suffix[1:2] != 'd':
            raise Exception("folders:browse: called on non dir objid %s" % pid)
        try:
            diridx = int(suffix[2:])
        except ValueError:
            raise Exception("folders:browse: bad pid %s" % pid)

    # A negative value would silently index _dirvec from the end.
    if diridx < 0 or diridx >= len(_dirvec):
        raise Exception("folders:browse: bad pid %s" % pid)

    return diridx
253
254
255
# Tell the top module what entries we define in the root
256
def rootentries(pid):
    """Return the list of entries this module defines in the root.

    The list holds a single directory entry titled '[folders]', with id
    pid + 'folders' and parent pid.
    """
    folders_entry = rcldirentry(pid + 'folders', pid, '[folders]')
    return [folders_entry]
258
259
260
# Look at all non-directory docs inside directory, and return the cover
261
# art we find.
262
def _arturifordir(diridx):
263
    for nm,ids in _dirvec[diridx].iteritems():
331
            for nm, ids in self._dirvec[fathidx].iteritems():
264
        if ids[1] >= 0:
332
                if ids[0] == diridx:
265
            doc = g_alldocs[ids[1]]
333
                    lpath.append(nm)
266
            if doc.mtype != 'inode/directory' and doc.albumarturi:
334
                    break
267
                return doc.albumarturi
335
                diridx = fathidx
268
              
336
                if diridx == 0:
337
                    break
269
338
270
# Folder hierarchy browse method.
339
        if not lpath:
271
# objid is like folders$index
340
            path = "/"
272
# flag is meta or children.
273
# httphp and pathprefix are used to generate URIs
274
def browse(pid, flag, httphp, pathprefix):
275
276
    diridx = _objidtodiridx(pid)
277
278
    # If there is only one entry in root, skip it. This means that 0
279
    # and 1 point to the same dir, but this does not seem to be an
280
    # issue
281
    if diridx == 0 and len(dirvec[0]) == 2:
282
        diridx = 1
283
        
284
    entries = []
285
286
    # The basename call is just for diridx==0 (topdirs). Remove it if
287
    # this proves a performance issue
288
    for nm,ids in _dirvec[diridx].iteritems():
289
        if nm == "..":
290
            continue
291
        thisdiridx = ids[0]
292
        thisdocidx = ids[1]
293
        if thisdocidx >= 0:
294
            doc = g_alldocs[thisdocidx]
295
        else:
341
        else:
296
            uplog("No doc for %s" % pid)
297
            doc = None
298
            
299
        if thisdiridx >= 0:
300
            # Skip empty directories
301
            if len(dirvec[thisdiridx]) == 1:
302
                continue
303
            id = _foldersIdPfx + '$' + 'd' + str(thisdiridx)
304
            if doc and doc.albumarturi:
305
                arturi = doc.albumarturi
306
            else:
307
                arturi = _arturifordir(thisdiridx)
308
            entries.append(rcldirentry(id, pid, os.path.basename(nm),
309
                                       arturi=arturi))
310
        else:
311
            # Not a directory. docidx had better been set
312
            if thisdocidx == -1:
313
                uplog("folders:docidx -1 for non-dir entry %s"%nm)
314
                continue
315
            doc = g_alldocs[thisdocidx]
316
            id = _foldersIdPfx + '$i' + str(thisdocidx)
317
            e = rcldoctoentry(id, pid, httphp, pathprefix, doc)
318
            if e:
319
                entries.append(e)
320
321
    return sorted(entries, cmp=cmpentries)
322
323
# Return path for objid, which has to be a container. This is good old
324
# pwd... It is called from the search module for generating a 'dir:'
325
# recoll filtering directive.
326
def dirpath(objid):
327
    # We may get called from search, on the top dir (above [folders]). Return
328
    # empty in this case
329
    try:
330
        diridx = _objidtodiridx(objid)
331
    except:
332
        return ""
333
334
    if diridx == 0:
335
        return "/"
336
    
337
    lpath = []
338
    while True:
339
        fathidx = _dirvec[diridx][".."][0]
340
        for nm, ids in _dirvec[fathidx].iteritems():
341
            if ids[0] == diridx:
342
                lpath.append(nm)
343
                break
344
        diridx = fathidx
345
        if diridx == 0:
346
            break
347
348
    if not lpath:
349
        path = "/"
350
    else:
351
        path = ""
342
            path = ""
352
    for elt in reversed(lpath):
343
        for elt in reversed(lpath):
353
        path += elt + "/"
344
            path += elt + "/"
354
345
355
    return path
346
        return path