|
a |
|
b/src/mediaserver/cdplugins/uprcl/uprcltags.py |
|
|
1 |
from __future__ import print_function
|
|
|
2 |
|
|
|
3 |
import sys
|
|
|
4 |
import sqlite3
|
|
|
5 |
from timeit import default_timer as timer
|
|
|
6 |
|
|
|
7 |
from uprclutils import *
|
|
|
8 |
|
|
|
9 |
# In-memory SQL database holding the album/track/tag structure built from
# the recoll documents. Switch to the commented file-based connection to
# inspect the database contents for debugging.
sqconn = sqlite3.connect(':memory:')
#sqconn = sqlite3.connect('/tmp/tracks.sqlite')

# TBD All Artists, Orchestra, Group
#
# Maps each displayed tag name to the SQL table which stores its distinct
# values. The table name also doubles as the recoll doc attribute name
# (see the getattr() calls below); 'Group' maps to 'grouptb', presumably
# because GROUP is an SQL keyword — TODO confirm.
#
# Maybe we'd actually need a 3rd value for the recoll field name, but
# it can be the same for the currently relevant fields.
tables = {'Artist' : 'artist',
          'Date' : 'date',
          'Genre' : 'genre',
          'All Artists' : 'allartists',
          'Composer' : 'composer',
          'Conductor' : 'conductor',
          'Orchestra' : 'orchestra',
          'Group' : 'grouptb',
          'Comment' : 'comment'
          }
|
|
|
26 |
|
|
|
27 |
def createsqdb(conn):
    """(Re)create the database tables on the given sqlite connection.

    Drops any pre-existing tables, then creates:
      - albums: one row per (title, folder) pair,
      - one auxiliary table per entry in the module-level 'tables' dict,
        each holding the distinct values for that tag,
      - tracks: one row per document, referencing albums and every
        auxiliary table through a <table>_id column.
    """
    c = conn.cursor()
    # The DROPs fail harmlessly when the tables do not exist yet
    # (narrowed from a bare except: we only expect sqlite errors here).
    try:
        c.execute('''DROP TABLE albums''')
        c.execute('''DROP TABLE tracks''')
    except sqlite3.Error:
        pass
    c.execute(
        '''CREATE TABLE albums
        (albid INTEGER PRIMARY KEY, albtitle TEXT, albfolder TEXT)''')

    # The tracks statement is built incrementally: one <tag>_id column
    # gets appended per auxiliary table.
    tracksstmt = '''CREATE TABLE tracks
    (docidx INT, albid INT, trackno INT, title TEXT'''

    # .items() instead of the py2-only .iteritems(): the file targets
    # Python 3 compatibility (see the __future__ import at the top).
    for tt, tb in tables.items():
        try:
            c.execute('DROP TABLE ' + tb)
        except sqlite3.Error:
            pass
        stmt = 'CREATE TABLE ' + tb + \
               ' (' + tb + '_id' + ' INTEGER PRIMARY KEY, value TEXT)'
        c.execute(stmt)
        tracksstmt += ',' + tb + '_id INT'

    tracksstmt += ')'
    c.execute(tracksstmt)
|
|
|
53 |
|
|
|
54 |
|
|
|
55 |
def auxtableinsert(sqconn, tb, value):
    """Return the id of 'value' in auxiliary table tb, inserting it if new."""
    cursor = sqconn.cursor()
    idcol = tb + '_id'
    cursor.execute('SELECT ' + idcol + ' FROM ' + tb + ' WHERE value = ?',
                   (value,))
    row = cursor.fetchone()
    if row is not None:
        return row[0]
    # Not seen yet: create the record and return the generated id.
    cursor.execute('INSERT INTO ' + tb + '(value) VALUES(?)', (value,))
    return cursor.lastrowid
|
|
|
69 |
|
|
|
70 |
# Global document vector, indexed by the docidx values stored in tracks.
g_alldocs = []

def recolltosql(docs):
    """Build the in-memory SQL database from a list of recoll docs.

    Stores the list in g_alldocs (the SQL records reference documents by
    their index in this list), then creates one tracks row per doc which
    has an 'album' attribute, creating album and tag-table records as
    needed along the way.
    """
    global g_alldocs
    g_alldocs = docs

    createsqdb(sqconn)

    c = sqconn.cursor()
    totcnt = 0
    # enumerate() instead of range(len()); also dropped an unused maxcnt.
    for docidx, doc in enumerate(docs):
        totcnt += 1
        album = getattr(doc, 'album', None)
        if not album:
            if doc.mtype != 'inode/directory' and \
                   doc.mtype != 'image/jpeg':
                pass
                #uplog("No album: mtype %s title %s" % (doc.mtype, doc.url))
            continue
        folder = docfolder(doc).decode('utf-8', errors = 'replace')
        # tracknumber may be missing, empty, or of the 'x/y' form;
        # narrowed from a bare except to the failures this can produce.
        try:
            parts = doc.tracknumber.split('/')
            trackno = int(parts[0])
        except (AttributeError, ValueError, IndexError):
            trackno = 0

        # Create album record if needed. There is probably a
        # single-statement syntax for this
        c.execute('''SELECT albid FROM albums
        WHERE albtitle = ? AND albfolder = ?''', (album, folder))
        r = c.fetchone()
        if r:
            albid = r[0]
        else:
            c.execute('''INSERT INTO albums(albtitle, albfolder)
            VALUES (?,?)''', (album, folder))
            albid = c.lastrowid

        # Build the track INSERT dynamically: only the tag columns
        # actually present on this doc get set.
        columns = 'docidx,albid,trackno,title'
        values = [docidx, albid, trackno, doc.title]
        placehold = '?,?,?,?'
        # .items() instead of the py2-only .iteritems().
        for tt, tb in tables.items():
            value = getattr(doc, tb, None)
            if not value:
                continue
            rowid = auxtableinsert(sqconn, tb, value)
            columns += ',' + tb + '_id'
            values.append(rowid)
            placehold += ',?'

        stmt = 'INSERT INTO tracks(' + columns + ') VALUES(' + placehold + ')'
        c.execute(stmt, values)
        #uplog(doc.title)

    sqconn.commit()
    uplog("recolltosql: processed %d docs" % totcnt)
|
|
|
129 |
|
|
|
130 |
def rootentries(pid):
    """Return the top-level directory entries for the tags view:
    'albums', 'items', then one '=Tag' entry per configured tag."""
    entries = [rcldirentry(pid + 'albums', pid, 'albums'),
               rcldirentry(pid + 'items', pid, 'items')]
    # Iterate the keys directly: the table-name values were unused, and
    # the previous .iteritems() was py2-only.
    for tt in tables:
        entries.append(rcldirentry(pid + '=' + tt, pid, tt))
    return entries
|
|
|
136 |
|
|
|
137 |
# Object id prefix identifying this content tree.
g_myprefix = '0$uprcl$'

def colid(col):
    """Return the name of the id column for auxiliary table 'col'."""
    return '%s_id' % col
|
|
|
141 |
|
|
|
142 |
def analyzesubtree(sqconn, recs):
    """Return the display names of tags having more than one distinct
    value among the subset of tracks designated by recs.

    recs is a sequence of (docidx,) rows as returned by a tracks query.
    Used while browsing to decide which tag subdirectories are still
    worth displaying.
    """
    docids = ','.join([str(r[0]) for r in recs])
    uplog("analyzesubtree, docids %s" % docids)
    c1 = sqconn.cursor()
    tags = []
    # .items() instead of the py2-only .iteritems().
    for tt, tb in tables.items():
        stmt = 'SELECT COUNT(DISTINCT ' + colid(tb) + \
               ') FROM tracks WHERE docidx IN (' + docids + ')'
        uplog("analyzesubtree: executing: <%s>" % stmt)
        c1.execute(stmt)
        for r in c1:
            cnt = r[0]
            uplog("Found %d distinct values for %s" % (cnt, tb))
            if cnt > 1:
                tags.append(tt)
    return tags
|
|
|
158 |
|
|
|
159 |
def seltagsbrowse(pid, qpath, flag, httphp, pathprefix):
    """Browse the tags tree below one of the '=Tag' root entries.

    qpath is the object path split on '$': elements starting with '='
    name a tag (key of the 'tables' dict), each non-terminal tag element
    being followed by a selected value id, e.g. ['=Artist', '33',
    '=Genre']. A WHERE clause restricting tracks is built from each
    (tag, value id) pair. If the last element is a bare '=Tag', the
    result lists the distinct values of that tag within the current
    track subset; otherwise it lists the selected tracks themselves, or
    further tag subdirectories when several tag values still vary.

    flag is unused here. httphp and pathprefix are passed through to
    rcldoctoentry() when building track entries.
    """
    uplog("seltagsbrowse. qpath %s" % qpath)
    qlen = len(qpath)
    selwhat = ''
    selwhere = ''
    values = []
    i = 0
    while i < qlen:
        elt = qpath[i]
        if elt.startswith('='):
            # Tag element: map the display name to its table name.
            col = tables[elt[1:]]
            # Parses as: (selwhere + ' AND ') if selwhere else ' WHERE '
            selwhere = selwhere + ' AND ' if selwhere else ' WHERE '
            if i == qlen - 1:
                # Terminal '=Tag'.
                # We want to display all unique values for the column
                # artist.artist_id, artist.value
                selwhat = col + '.' + col + '_id, ' + col + '.value'
                # tracks.artist_id = artist.artist_id
                selwhere += 'tracks.'+ col + '_id = ' + col + '.' + col + '_id'
            else:
                # Look at the value specified for the =xx column:
                # restrict the track set and consume the value element.
                selwhat = 'tracks.docidx'
                selwhere += 'tracks.' + col + '_id = ?'
                i += 1
                values.append(int(qpath[i]))
        i += 1

    c = sqconn.cursor()
    #for r in c:
    #    uplog("selres: %s" % r)
    entries = []
    if selwhat == 'tracks.docidx':
        # Path ended on a value selection: fetch the matching tracks.
        # SELECT docidx FROM tracks
        # WHERE col1_id = ? AND col2_id = ?
        stmt = "SELECT docidx FROM tracks %s" % selwhere
        uplog("seltagsbrowse: executing <%s> values %s" % (stmt, values))
        c.execute(stmt, values)
        recs = c.fetchall()
        subqs = analyzesubtree(sqconn, recs)
        if not subqs:
            # No tag varies any more: list the tracks themselves.
            for r in recs:
                docidx = r[0]
                id = pid + '$*i' + str(docidx)
                entries.append(rcldoctoentry(id, pid, httphp, pathprefix,
                                             g_alldocs[docidx]))
        else:
            # Some tags still have several values: show them as subdirs.
            for tt in subqs:
                id = pid + '$=' + tt
                entries.append(rcldirentry(id, pid, tt))
    else:
        # Terminal '=Tag': list its distinct values in the current set.
        # SELECT col.value FROM tracks, col
        # WHERE tracks.col_id = col.col_id
        # GROUP BY tracks.col_id
        # ORDER BY col.value
        stmt = "SELECT " + selwhat + " FROM tracks, " + col + \
               selwhere + \
               " GROUP BY tracks." + col + '_id' + \
               " ORDER BY value"
        uplog("seltagsbrowse: executing <%s> values %s" % (stmt, values))
        c.execute(stmt, values)
        for r in c:
            # r is (value_id, value); the entry id embeds the value id.
            id = pid + '$' + str(r[0])
            entries.append(rcldirentry(id, pid, r[1]))
    return entries
|
|
|
222 |
|
|
|
223 |
def browse(pid, flag, httphp, pathprefix):
    """Browse the tags content tree at object id pid.

    Dispatches on the first path element after the tree prefix:
    'items' lists every track, 'albums' lists albums then their
    tracks, and '=Tag' paths are handed over to seltagsbrowse().
    Raises on any other path shape.
    """
    idpath = pid.replace(g_myprefix, '', 1)
    uplog('tags:browse: idpath <%s>' % idpath)
    qpath = idpath.split('$')
    cursor = sqconn.cursor()
    entries = []
    if idpath.startswith('items'):
        # Flat list of all tracks, sorted by title.
        cursor.execute('''SELECT docidx FROM tracks ORDER BY title''')
        for row in cursor:
            docidx = row[0]
            objid = pid + '$*i' + str(docidx)
            entries.append(rcldoctoentry(objid, pid, httphp, pathprefix,
                                         g_alldocs[docidx]))
    elif idpath.startswith('albums'):
        if len(qpath) == 1:
            # Albums root: one directory entry per album.
            cursor.execute('''SELECT albid, albtitle FROM albums
            ORDER BY albtitle''')
            for row in cursor:
                objid = pid + '$*' + str(row[0])
                entries.append(rcldirentry(objid, pid, row[1]))
        elif len(qpath) == 2:
            # Inside one album: its tracks in track-number order.
            e1 = qpath[1]
            if not e1.startswith("*"):
                raise Exception("Bad album id in albums tree. Pth: %s" %idpath)
            albid = int(e1[1:])
            cursor.execute('''SELECT docidx FROM tracks WHERE albid = ? ORDER BY
            trackno''', (albid,))
            for row in cursor:
                docidx = row[0]
                objid = pid + '$*i' + str(docidx)
                entries.append(rcldoctoentry(objid, pid, httphp, pathprefix,
                                             g_alldocs[docidx]))
        else:
            raise Exception("Bad path in album tree (too deep): <%s>"%idpath)
    elif idpath.startswith('='):
        entries = seltagsbrowse(pid, qpath, flag, httphp, pathprefix)
    else:
        raise Exception('Bad path in tags tree (start>):<%s>'%idpath)
    return entries
|
|
|
262 |
|
|
|
263 |
|
|
|
264 |
def misctries():
    """Assorted timing experiments on the track database (debug only).

    NOTE(review): these queries reference an 'album' column, but
    createsqdb() gives tracks an 'albid' column, not 'album' — this
    debug helper looks stale; confirm before relying on it.
    """
    cur = sqconn.cursor()
    cur.execute('''SELECT COUNT(*) FROM tracks''')
    uplog("Count(*) %d" % (cur.fetchone()[0],))

    # For some strange reason it appears that GROUP BY is faster than SELECT
    # DISTINCT
    query = '''SELECT album FROM tracks GROUP BY album ORDER BY album'''
    t0 = timer()
    for rec in cur.execute(query):
        pass
    t1 = timer()
    uplog("Select took %.2f Seconds" % (t1 - t0))
    for rec in cur.execute('''SELECT COUNT(DISTINCT album) from tracks'''):
        uplog("Album count %d" % rec[0])
|
|
|
285 |
|
|
|
286 |
|
|
|
287 |
# Standalone test driver: pull documents out of a local recoll index,
# build the SQL database from them, and report the timing of each phase.
if __name__ == '__main__':
    confdir = "/home/dockes/.recoll-mp3"
    from recoll import recoll

    def fetchalldocs(confdir):
        # Retrieve up to maxcnt documents from the recoll index under
        # confdir by running a match-all query and batch-fetching.
        allthedocs = []
        rcldb = recoll.connect(confdir=confdir)
        rclq = rcldb.query()
        rclq.execute("mime:*", stemming=0)
        uplog("Estimated alldocs query results: %d" % (rclq.rowcount))
        maxcnt = 1000
        totcnt = 0
        while True:
            docs = rclq.fetchmany()
            for doc in docs:
                allthedocs.append(doc)
                totcnt += 1
            # Stop when the doc limit is reached or on a short (final)
            # batch.
            if (maxcnt > 0 and totcnt >= maxcnt) or \
               len(docs) != rclq.arraysize:
                break
        uplog("Retrieved %d docs" % (totcnt,))
        return allthedocs

    start = timer()
    docs = fetchalldocs(confdir)
    end = timer()
    uplog("Recoll extract took %.2f Seconds" % (end - start))
    start = timer()
    recolltosql(docs)
    end = timer()
    uplog("SQL db create took %.2f Seconds" % (end - start))
|