Parent: [9eb77d] (diff)

Child: [64bf01] (diff)

Download this file

recollq.py    126 lines (107 with data), 3.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env python
"""A python version of the command line query tool recollq (a bit simplified)
The input string is always interpreted as a query language string.
With some customization, this could actually be useful for something.
"""
import sys
from getopt import getopt
# Prefer the package layout that provides both recoll.recoll and
# recoll.rclextract; otherwise fall back to a plain "recoll" module.
# hasextract records whether rclextract (needed by extract() and
# extractofile()) could be imported.
try:
    from recoll import recoll
    from recoll import rclextract
    hasextract = True
except ImportError:
    # Only a failed import triggers the fallback; a bare "except:" would
    # also swallow KeyboardInterrupt/SystemExit and hide unrelated errors.
    import recoll
    hasextract = False
# Document metadata field names; not referenced by the rest of this script.
allmeta = (
    "title",
    "keywords",
    "abstract",
    "url",
    "mimetype",
    "mtime",
    "ipath",
    "fbytes",
    "dbytes",
    "relevancyrating",
)
def Usage():
    """Print the command-line synopsis to stderr and exit with status 1.

    Raises:
        SystemExit: always, with exit code 1.
    """
    # sys.stderr.write() replaces the Python 2-only "print >> sys.stderr"
    # statement so this function parses on both Python 2 and Python 3.
    sys.stderr.write(
        "Usage: recollq.py [-c conf] [-i extra_index] <recoll query>\n")
    sys.exit(1)
class ptrmeths:
    """Callbacks producing the markup placed around matched terms.

    An instance is handed to query.makedocabstract() as its "methods"
    argument by doquery(). Each match is wrapped in an HTML <span> that
    carries the group index and the terms of that group.
    """

    def __init__(self, groups):
        # Term groups as obtained from query.getgroups(); startMatch()
        # joins element [1] of the selected entry as a sequence of strings.
        self.groups = groups

    def startMatch(self, idx):
        """Return the opening <span> tag for a match in group *idx*."""
        terms = self.groups[idx][1]
        # NOTE(review): the joined terms are inserted into the attribute
        # value unescaped.
        return '<span class="pyrclstart" idx="%d" ugroup="%s">' % (
            idx, " ".join(terms))

    def endMatch(self):
        """Return the closing tag matching startMatch()."""
        return '</span>'
def extract(doc):
    """Return the text-extraction result for *doc*.

    Creates an rclextract.Extractor for the document and calls its
    textextract() method with the document's ipath. rclextract is only
    imported when hasextract is True.
    """
    return rclextract.Extractor(doc).textextract(doc.ipath)
def extractofile(doc, outfilename=""):
    """Write the document out through rclextract and return the file name.

    Calls Extractor.idoctofile() with the document's ipath and mimetype,
    passing *outfilename* as the "ofilename" keyword, and returns whatever
    that call returns. rclextract is only imported when hasextract is True.
    """
    written_name = rclextract.Extractor(doc).idoctofile(
        doc.ipath, doc.mimetype, ofilename=outfilename)
    return written_name
def _emit(text="", end="\n"):
    """Write *text* followed by *end* to standard output.

    Stands in for the Python 2-only print statement so the code parses and
    runs on both Python 2 and Python 3.
    """
    data = text + end
    if not isinstance(data, str):
        # Only reachable on Python 2, where unicode is not str: encode
        # explicitly, as the print statements did with .encode("utf-8").
        data = data.encode("utf-8")
    sys.stdout.write(data)


def doquery(db, q):
    """Run the query string *q* against *db* and print the first results.

    Args:
        db: connection object; only its query() method is used here.
        q: query string passed to query.execute().

    Prints the Xapian query, the term groups and the result count, then,
    for at most 20 results, the row number, the title/mtime/author fields
    and an abstract whose matches are marked up through ptrmeths.
    """
    # Get query object
    query = db.query()
    #query.sortby("dmtime", ascending=True)

    # Parse/run input query string
    nres = query.execute(q, stemming=0, stemlang="english")
    _emit(u"Xapian query: [%s]" % query.getxquery())
    groups = query.getgroups()
    _emit("Groups: %s" % (groups,))
    m = ptrmeths(groups)

    # Print results (two spaces after the colon match the original output).
    _emit("Result count:  %s %s" % (nres, query.rowcount))

    # Alternative: results = query.fetchmany(nres); for doc in results: ...
    for _ in range(min(nres, 20)):
        doc = query.fetchone()
        # NOTE(review): query.next is used as the row number only when it
        # is an int; otherwise query.rownumber is used. Presumably this
        # covers different versions of the recoll module -- confirm.
        if isinstance(query.next, int):
            rownum = query.next
        else:
            rownum = query.rownumber
        # No newline here: the first field is printed on the same line.
        _emit("%s : " % (rownum,), end="")

        # Alternative: dump every field with doc.items().items()
        # Alternative: outfile = extractofile(doc)
        for k in ("title", "mtime", "author"):
            value = getattr(doc, k)
            # value = doc.get(k)
            if value is None:
                _emit("%s : (None)" % k)
            else:
                _emit("%s : %s" % (k, value))

        #doc.setbinurl(bytearray("toto"))
        #burl = doc.getbinurl()

        # Named "abstract" rather than "abs" to avoid shadowing the builtin.
        abstract = query.makedocabstract(doc, methods=m)
        _emit(abstract)
        _emit()
        # Alternative: fulldoc = extract(doc), then use fulldoc.mimetype
        # and fulldoc.text
########################################### MAIN
# Script body: parse the command line, connect to the index and run the
# query. sys.stdout/sys.stderr.write() are used instead of the Python 2-only
# print statement so this section parses on both Python 2 and Python 3.
if len(sys.argv) < 2:
    Usage()

confdir = ""
extra_dbs = []
# Snippet params
maxchars = 120
contextwords = 4

# Process options: [-c confdir] [-i extra_db [-i extra_db] ...]
# Imported here because the top of the file only imports the getopt function.
from getopt import GetoptError
try:
    options, args = getopt(sys.argv[1:], "c:i:")
except GetoptError as err:
    # getopt() raises for unknown options or missing option arguments, so
    # bad options are reported here rather than in the loop below.
    sys.stderr.write("Bad opt:  %s\n" % err)
    Usage()
for opt, val in options:
    if opt == "-c":
        confdir = val
    elif opt == "-i":
        extra_dbs.append(val)

# The query should be in the remaining arg(s)
if not args:
    sys.stderr.write("No query found in command line\n")
    Usage()

# Each word is followed by one space, including the last one, exactly as
# the original concatenation loop produced.
q = " ".join(args) + " "
sys.stdout.write("QUERY: [ %s ]\n" % q)

db = recoll.connect(confdir=confdir,
                    extra_dbs=extra_dbs)
db.setAbstractParams(maxchars=maxchars, contextwords=contextwords)
doquery(db, q)