upmpdcli / Code / [942467] /src/mediaserver/cdplugins/uprcl/uprcltags.py

[942467]: src / mediaserver / cdplugins / uprcl / uprcltags.py History

uprcltags.py 597 lines (514 with data), 22.6 kB

#
# Copyright (C) 2017 J.F.Dockes
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
PY3 = sys.version > '3'
import os
import sqlite3
import time
import tempfile
from timeit import default_timer as timer

from upmplgutils import uplog
from uprclutils import g_myprefix, audiomtypes, docfolder, \
     rcldirentry, rcldoctoentry, cmpentries

# Tags for which we create auxiliary tables for facet descent.
#
# TBD: The list will come from the config file one day
# TBD: alias et al configuration
#
# TBD: All Artists
#
# The key is the presentation name (container title). The value is the
# auxiliary table name, used also as base for unique id and join
# columns (with _id) appended, and is also currently the recoll field
# name (with a provision to differ if needed, thanks to the
# currently empty _coltorclfield dict).
_tagtotable = {
    'Artist' : 'artist',
    'Date' : 'date',
    'Genre' : 'genre',
#   'All Artists' : 'allartists',
    'Composer' : 'composer',
    'Conductor' : 'conductor',
    'Orchestra' : 'orchestra',
    'Group' : 'contentgroup',
    'Comment' : 'comment'
    }

def _clid(table):
    return table + '_id'

# Translation only used when fetching fields from the recoll
# record. None at the moment
_coltorclfield = {
    }



# The browseable object which defines the tree of tracks organized by tags.
class Tagged(object):
    def __init__(self, docs, httphp, pathprefix):
        self._httphp = httphp
        self._pprefix = pathprefix
        self._conn = None
        self._rcldocs = docs
        self._init_sqconn()
        # Compute an array of (table name, recoll field) translations. Most
        # often they are identical.
        self._tabtorclfield = []
        for tb in _tagtotable.values():
            if tb in _coltorclfield:
                rclfld = _coltorclfield[tb]
            else:
                rclfld = tb
            self._tabtorclfield.append((tb, rclfld))
        self._recolltosql(docs)
        

    def _init_sqconn(self):
        # We use a separate thread for building the db to ensure
        # responsiveness during this phase.  :memory: handles normally
        # can't be shared between threads, and different :memory: handles
        # access different dbs. The following would work, but it needs
        # python 3.4
        #self._conn = sqlite3.connect('file:uprcl_db?mode=memory&cache=shared')
        # As we can guarantee that 2 threads will never access the db at
        # the same time (the init thread just goes away when it's done),
        # we just disable the same_thread checking on :memory:
        if self._conn is None:
            self._conn = sqlite3.connect(':memory:', check_same_thread=False)


    # Create the db. Each tag table has 2 columns: <tagname>_id and
    # value. The join column in the main tracks table is also named
    # <tagname>_id
    def _createsqdb(self):
        c = self._conn.cursor()
        try:
            c.execute('''DROP TABLE albums''')
            c.execute('''DROP TABLE tracks''')
        except:
            pass
        c.execute(
            '''CREATE TABLE albums (album_id INTEGER PRIMARY KEY, artist_id INT,
            albtitle TEXT, albfolder TEXT, albdate TEXT, albarturi TEXT)''')

        tracksstmt = '''CREATE TABLE tracks
        (docidx INT, album_id INT, trackno INT, title TEXT'''

        for tb in _tagtotable.values():
            try:
                c.execute('DROP TABLE ' + tb)
            except:
                pass
            stmt = 'CREATE TABLE ' + tb + \
               ' (' + _clid(tb) + ' INTEGER PRIMARY KEY, value TEXT)'
            c.execute(stmt)
            tracksstmt += ',' + _clid(tb) + ' INT'

        tracksstmt += ')'
        c.execute(tracksstmt)
    

    # Insert new value if not existing, return rowid of new or existing row
    def _auxtableinsert(self, tb, value):
        c = self._conn.cursor()
        stmt = 'SELECT ' + _clid(tb) + ' FROM ' + tb + ' WHERE value = ?'
        c.execute(stmt, (value,))
        r = c.fetchone()
        if r:
            rowid = r[0]
        else:
            stmt = 'INSERT INTO ' + tb + '(value) VALUES(?)'
            c.execute(stmt, (value,))
            rowid = c.lastrowid
        return rowid


    # tracknos like n/max are now supposedly processed by rclaudio and
    # should not arrive here, but let's play it safe.
    def _tracknofordoc(self, doc):
        try:
            return int(doc.tracknumber.split('/')[0])
        except:
            return 0

    # Create album record if needed.
    # The albums table is special, can't use auxtableinsert()
    def _maybecreatealbum(self, c, doc):
        folder = docfolder(doc).decode('utf-8', errors = 'replace')
        album = getattr(doc, 'album', None)
        if not album:
            album = os.path.basename(folder)
            #uplog("Using %s for alb: mime %s title %s" %
            #(album,doc.mtype, doc.url))
        if doc.albumartist:
            albartist_id = self._auxtableinsert('artist', doc.albumartist)
        else:
            albartist_id = None
        c.execute('''SELECT album_id, artist_id FROM albums
        WHERE albtitle = ? AND albfolder = ?''', (album, folder))
        r = c.fetchone()
        if r:
            album_id = r[0]
            albartist_id = r[1]
        else:
            c.execute('''INSERT INTO albums(albtitle, albfolder, artist_id,
            albdate, albarturi)
            VALUES (?,?,?,?,?)''', (album, folder, albartist_id, doc.date,
                                    doc.albumarturi))
            album_id = c.lastrowid
        return album_id, albartist_id


    # Create the db and fill it up with the values we need, taken out of
    # the recoll records list
    def _recolltosql(self, docs):
        start = timer()

        self._createsqdb()

        c = self._conn.cursor()
        maxcnt = 0
        totcnt = 0
        for docidx in range(len(docs)):
            doc = docs[docidx]
            totcnt += 1

            if totcnt % 1000 == 0:
                time.sleep(0)
            
            # No need to include non-audio types in the visible tree.
            if doc.mtype not in audiomtypes or doc.mtype == 'inode/directory':
                continue

            album_id, albartist_id = self._maybecreatealbum(c, doc)
            
            trackno = self._tracknofordoc(doc)
            
            # Set base values for column names, values list,
            # placeholders
            columns = 'docidx,album_id,trackno,title'
            values = [docidx, album_id, trackno, doc.title]
            placehold = '?,?,?,?'
            # Append data for each auxiliary table if the doc has a value
            # for the corresponding field (else let SQL set a dflt/null value)
            for tb, rclfld in self._tabtorclfield:
                value = getattr(doc, rclfld, None)
                if not value:
                    continue
                rowid = self._auxtableinsert(tb, value)
                columns += ',' + _clid(tb)
                values.append(rowid)
                placehold += ',?'

            # Create the main record in the tracks table.
            stmt='INSERT INTO tracks(' + columns + ') VALUES(' + placehold + ')'
            c.execute(stmt, values)
            #uplog(doc.title)

            # If the album had no artist yet, set it from the track
            # artist.  Means that if tracks for the same album have
            # different artist values, we arbitrarily use the first
            # one.
            if not albartist_id:
                lcols = columns.split(',')
                try:
                    i = lcols.index('artist_id')
                    artist_id = values[i]
                    stmt = 'UPDATE albums SET artist_id = ? WHERE album_id = ?'
                    c.execute(stmt, (artist_id, album_id))
                except:
                    pass
        ## End Big doc loop
        
        self._conn.commit()
        end = timer()
        uplog("recolltosql: processed %d docs in %.2f Seconds" %
              (totcnt, end-start))


    # Create our top-level directories, with fixed entries, and stuff from
    # the tags tables
    def rootentries(self, pid):
        c = self._conn.cursor()
        c.execute("SELECT COUNT(*) from albums")
        nalbs = str(c.fetchone()[0])
        c.execute("SELECT COUNT(*) from tracks")
        nitems = str(c.fetchone()[0])
        entries = [rcldirentry(pid + 'albums', pid, nalbs + ' albums'),
                   rcldirentry(pid + 'items', pid, nitems + ' items')]
        for tt in sorted(_tagtotable.keys()):
            entries.append(rcldirentry(pid + '=' + tt , pid, tt))
        return entries


    # Return the list of tags which have multiple values inside the
    # input list of docids.
    def _subtreetags(self, docidsl):
        docids = ','.join([str(i) for i in docidsl])
        uplog("subtreetags, docids %s" % docids)
        c = self._conn.cursor()
        tags = []
        for tt,tb in _tagtotable.items():
            stmt = 'SELECT COUNT(DISTINCT ' + _clid(tb) + \
                   ') FROM tracks WHERE docidx IN (' + docids + ')'
            uplog("subtreetags: executing: <%s>" % stmt)
            c.execute(stmt)
            cnt = c.fetchone()[0]
            uplog("Found %d distinct values for %s" % (cnt, tb))
            if cnt > 1:
                tags.append(tt)
        return tags


    # Build a list of track directory entries for an SQL statement
    # which selects docidxs (SELECT docidx,... FROM tracks WHERE...)
    def _trackentriesforstmt(self, stmt, values, pid):
        c = self._conn.cursor()
        c.execute(stmt, values)
        entries = [rcldoctoentry(pid + '$i' + str(r[0]),
                                 pid, self._httphp, self._pprefix,
                                 self._rcldocs[r[0]]) for r in c]
        if PY3:
            return sorted(entries, key=cmpentries)
        else:
            return sorted(entries, cmp=cmpentries)
    

    # Return a list of trackids as selected by the current
    # path <selwhere> is like: WHERE col1_id = ? AND col2_id = ? [...], and
    # <values> holds the corresponding values
    def _docidsforsel(self, selwhere, values):
        c = self._conn.cursor()
        stmt = 'SELECT docidx FROM tracks ' + selwhere + ' ORDER BY trackno'
        uplog("docidsforsel: executing <%s> values %s" % (stmt, values))
        c.execute(stmt, values)
        return [r[0] for r in c.fetchall()]


    def _trackentriesforalbum(self, albid, pid):
        stmt = 'SELECT docidx FROM tracks WHERE album_id = ? ORDER BY trackno'
        return self._trackentriesforstmt(stmt, (albid,), pid)


    def _direntriesforalbums(self, pid, where):
        entries = []
        c = self._conn.cursor()
        if not where:
            where = ' LEFT JOIN artist ON artist.artist_id = albums.artist_id '
            frm = ' FROM albums '
        else:
            where += ' AND artist.artist_id = albums.artist_id '
            frm = ' FROM albums,artist '

        stmt = 'SELECT album_id, albtitle, albarturi, albdate, artist.value' + \
               frm + where + ' ORDER BY albtitle'
        uplog('direntriesforalbums: %s' % stmt)
        c.execute(stmt)
        for r in c:
            id = pid + '$' + str(r[0])
            entries.append(
                rcldirentry(id, pid, r[1], arturi=r[2], date=r[3],artist=r[4], 
                            upnpclass='object.container.album.musicAlbum'))
        return entries


    # This is called when an 'albums' element is encountered in the
    # selection path. i is the index of the albums element. The tree under
    # albums has a well defined structure: ql=len(qpath), we have an
    # albums list if i is the last element (i == ql-1), an album track
    # list for i == ql-2 (we then have an albid at ql-1), and a 'Complete
    # album' query if i == ql-3 (...$albums$xxx$showca)
    def _tagsbrowsealbums(self, pid, qpath, i, selwhere, values):
        uplog("_tagsbrowsealbums: pid %s qpath %s i %s selwhere %s values %s" %
              (pid, qpath, i, selwhere, values))
        c = self._conn.cursor()
        docidsl = self._docidsforsel(selwhere, values)
        entries = []
        if i == len(qpath)-1:
            albidsl = self._subtreealbums(docidsl)
            albids = ','.join([str(a) for a in albidsl])
            where = ' WHERE album_id in (' + albids + ') '
            entries = self._direntriesforalbums(pid, where)
        elif i == len(qpath)-2:
            albid = int(qpath[-1])
            docids = ','.join([str(i) for i in docidsl])
            stmt = 'SELECT COUNT(docidx) FROM tracks WHERE album_id = ?'
            c.execute(stmt, (albid,))
            r = c.fetchone()
            ntracks = int(r[0])
            stmt = 'SELECT docidx FROM tracks ' + \
                   'WHERE album_id = ? AND docidx IN ('+ docids + ')'
            entries = self._trackentriesforstmt(stmt, (albid,), pid)
            if ntracks != len(entries):
                id = pid + '$' + 'showca'
                entries = [rcldirentry(id, pid, '>> Complete Album')] + entries
        elif i == len(qpath)-3:
            # Note that minim has an additional level here, probably to
            # present groups or multiple groups ? The trackids ids are
            # like: 
            #    0$=Composer$17738$albums$2$showca.0$hcalbum$*i13458
            # I don't know what the .0 is for.
            # The 'hcalbum' level usually has 2 entries '>> Hide Content' 
            # and the album title. TBD
            albid = int(qpath[-2])
            entries = self._trackentriesforalbum(albid, pid)
        
        return entries


    # This is called when an 'items' element is encountered in the
    # selection path. We just list the selected tracks
    # TBD: need Complete Album here too ?
    def _tagsbrowseitems(self, pid, qpath, i, selwhere, values):
        stmt = 'SELECT docidx FROM tracks ' + selwhere
        return self._trackentriesforstmt(stmt, values, pid)


    # Return all albums ids to which any of the currently selected tracks
    # (designated by a docid set) belong
    def _subtreealbums(self, docidsl):
        docids = ','.join([str(r) for r in docidsl])
        albids = []
        stmt = 'SELECT album_id from tracks where docidx IN (' \
               + docids + ') ' + 'GROUP BY album_id'
        c = self._conn.cursor()
        uplog('subtreealbums: executing %s' % stmt)
        c.execute(stmt)
        for r in c:
            albids.append(r[0])
        uplog('subtreealbums: returning %s' % albids)
        return albids
    

    # Main browsing routine. Given an objid, translate it into a select
    # statement, plus further processing, and return the corresponding
    # records
    def _tagsbrowse(self, pid, qpath, flag):
        uplog("tagsbrowse. pid %s qpath %s" % (pid, qpath))
        qlen = len(qpath)
        selwhat = ''
        selwhere = ''
        values = []
        i = 0
        while i < qlen:
            elt = qpath[i]

            # '=colname'. Set the current column name, which will be used
            # in different ways depending if this is the last element or
            # not.
            if elt.startswith('='):
                col = _tagtotable[elt[1:]] 

            # Detect the special values: albums items etc. here. Their
            # presence changes how we process the rest (showing tracks and
            # albums and not dealing with other tags any more)
            if elt == 'albums':
                return self._tagsbrowsealbums(pid, qpath, i, selwhere, values)
            elif elt == 'items':
                return self._tagsbrowseitems(pid, qpath, i, selwhere, values)
            
            selwhere = selwhere + ' AND ' if selwhere else ' WHERE '
            if i == qlen - 1:
                # We want to display all unique values for the column
                # artist.artist_id, artist.value
                selwhat = col + '.' + _clid(col) + ', ' + col + '.value'
                # tracks.artist_id = artist.artist_id
                selwhere += 'tracks.' + _clid(col) + ' = ' + col + \
                            '.' + _clid(col)
            else:
                # Look at the value specified for the =xx column. The
                # selwhat value is only used as a flag
                selwhat = 'tracks.docidx'
                selwhere += 'tracks.' + _clid(col) + ' =  ?'
                i += 1
                values.append(int(qpath[i]))
            i += 1
            

        # TBD: Need a ">> Complete Album" entry if there is a single
        # album, no subqs and not all the tracks are listed
        entries = []
        if selwhat == 'tracks.docidx':
            docids = self._docidsforsel(selwhere, values)
            albids = self._subtreealbums(docids)
            subqs = self._subtreetags(docids)
            if len(albids) > 1:
                id = pid + '$albums'
                entries.append(rcldirentry(id, pid, str(len(albids)) +
                                           ' albums'))
                if subqs:
                    id = pid + '$items'
                    entries.append(rcldirentry(id,pid, str(len(docids)) +
                                               ' items'))
            elif len(albids) == 1 and subqs:
                id = pid + '$items'
                entries.append(rcldirentry(id,pid, str(len(docids)) + ' items'))

            if not subqs:
                for docidx in docids:
                    id = pid + '$*i' + str(docidx)
                    entries.append(rcldoctoentry(id, pid, self._httphp,
                                                 self._pprefix,
                                                 self._rcldocs[docidx]))
                    if PY3:
                        entries = sorted(entries, key=cmpentries)
                    else:
                        entries = sorted(entries, cmp=cmpentries)
            else:
                for tt in subqs:
                    id = pid + '$=' + tt
                    entries.append(rcldirentry(id, pid, tt))
        else:
            # SELECT col.value FROM tracks, col
            # WHERE tracks.col_id = col.col_id
            # GROUP BY tracks.col_id
            # ORDER BY col.value
            stmt = "SELECT " + selwhat + " FROM tracks, " + col + \
                   selwhere + \
                   " GROUP BY tracks." + _clid(col) + \
                   " ORDER BY value"
            uplog("tagsbrowse: executing <%s> values %s" % (stmt,values))
            c = self._conn.cursor()
            c.execute(stmt, values)
            for r in c:
                id = pid + '$' + str(r[0])
                entries.append(rcldirentry(id, pid, r[1]))
        return entries


    # Browse the top-level tree named like 'xxx albums'. There are just 2
    # levels: the whole albums list, then for each entry the specified
    # albums track list
    def _albumsbrowse(self, pid, qpath, flag):
        c = self._conn.cursor()
        entries = []
        if len(qpath) == 1:
            entries = self._direntriesforalbums(pid, '')
        elif len(qpath) == 2:
            e1 = qpath[1]
            album_id = int(e1)
            entries = self._trackentriesforalbum(album_id, pid)
        else:
            raise Exception("Bad path in album tree (too deep): <%s>" % qpath)

        return entries


    # Top level browse routine. Handle the special cases and call the
    # appropriate worker routine.
    def browse(self, pid, flag):
        idpath = pid.replace(g_myprefix, '', 1)
        uplog('tags:browse: idpath <%s>' % idpath)
        entries = []
        qpath = idpath.split('$')
        if idpath.startswith('items'):
            stmt = 'SELECT docidx FROM tracks'
            entries = self._trackentriesforstmt(stmt, (), pid)
        elif idpath.startswith('albums'):
            entries = self._albumsbrowse(pid, qpath, flag)
        elif idpath.startswith('='):
            entries = self._tagsbrowse(pid, qpath, flag)
        else:
            raise Exception('Bad path in tags tree (start): <%s>' % idpath)
        return entries









############ Misc test/trial code, not used by uprcl ########################

def misctries():
    c = self._conn.cursor()
    c.execute('''SELECT COUNT(*) FROM tracks''')
    uplog("Count(*) %d" % (c.fetchone()[0],))
    
    #for row in c.execute('''SELECT album
    #                        FROM tracks where artist LIKE "%Gould%"
    #                        GROUP BY album'''):
    #    uplog("%s" % (row,))

    # For some strange reason it appears that GROUP BY is faster than SELECT
    # DISTINCT
    stmt = '''SELECT album FROM tracks GROUP BY album ORDER BY album'''
    start = timer()
    for row in c.execute(stmt):
        #uplog("%s" % (row[0].encode('UTF-8')))
        pass
    end = timer()
    uplog("Select took %.2f Seconds" % (end - start))
    for row in c.execute('''SELECT COUNT(DISTINCT album) from tracks'''):
        uplog("Album count %d" % row[0])


if __name__ == '__main__':
    confdir = "/home/dockes/.recoll-mp3"
    from recoll import recoll

    def fetchalldocs(confdir):
        allthedocs = []
        rcldb = recoll.connect(confdir=confdir)
        rclq = rcldb.query()
        rclq.execute("mime:*", stemming=0)
        uplog("Estimated alldocs query results: %d" % (rclq.rowcount))
        maxcnt = 1000
        totcnt = 0
        while True:
            docs = rclq.fetchmany()
            for doc in docs:
                allthedocs.append(doc)
                totcnt += 1
            if (maxcnt > 0 and totcnt >= maxcnt) or \
                   len(docs) != rclq.arraysize:
                break
        uplog("Retrieved %d docs" % (totcnt,))
        return allthedocs
    
    start = timer()
    docs = fetchalldocs(confdir)
    end = timer()
    uplog("Recoll extract took %.2f Seconds" % (end - start))
    start = timer()
    recolltosql(docs)
    end = timer()
    uplog("SQL db create took %.2f Seconds" % (end - start))