Download this file

rcldlkp.py    118 lines (96 with data), 2.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
__doc__ = """
An exemple indexer for an arbitrary multi-document file format.
Not supposed to run ''as-is'' or be really useful.
''Lookup'' notes file indexing
The file format has text notes separated by lines with a single '%' character
If the script is called with just the file name as an argument, it will
(re)index the contents.
If the script is called with second numeric argument, it will retrieve the
specified record and output it in html
"""
import os
import stat
import sys
import re
rclconf = "/Users/dockes/.recoll-dlkp"
def udi(docfile, numrec):
return docfile + "#" + str(numrec)
###############################################################
def index_rec(db, numrec, rec):
doc = recoll.Doc()
# url
doc.url = "file://" + docfile
# utf8fn
# ipath
doc.ipath = str(numrec)
# mimetype
doc.mimetype = "text/plain"
# mtime
# origcharset
# title
lines = rec.split("\n")
if len(lines) >= 2:
doc.title = unicode(lines[1], "iso-8859-1")
if len(doc.title.strip()) == 0 and len(lines) >= 3:
doc.title = unicode(lines[2], "iso-8859-1")
# keywords
# abstract
# author
# fbytes
doc.fbytes = str(fbytes)
# text
doc.text = unicode(rec, "iso-8859-1")
# dbytes
doc.dbytes = str(len(rec))
# sig
if numrec == 0:
doc.sig = str(fmtime)
db.addOrUpdate(udi(docfile, numrec), doc)
def output_rec(rec):
# Escape html
rec = unicode(rec, "iso-8859-1").encode("utf-8")
rec = rec.replace("<", "&lt;");
rec = rec.replace("&", "&amp;");
rec = rec.replace('"', "&dquot;");
print '<html><head>'
print '<meta http-equiv="Content-Type" content="text/html;charset=UTF-8">'
print '</head><body><pre>'
print rec
print '</pre></body></html>'
################################################################
def usage():
sys.stderr.write("Usage: rcldlkp.py <filename> [<recnum>]\n")
exit(1)
if len(sys.argv) < 2:
usage()
docfile = sys.argv[1]
if len(sys.argv) > 2:
targetnum = int(sys.argv[2])
else:
targetnum = None
#print docfile, targetnum
stdata = os.stat(docfile)
fmtime = stdata[stat.ST_MTIME]
fbytes = stdata[stat.ST_SIZE]
f = open(docfile, 'r')
if targetnum == None:
import recoll
db = recoll.connect(confdir=rclconf, writable=1)
if not db.needUpdate(udi(docfile, 0), str(fmtime)):
exit(0)
rec = ""
numrec = 1
for line in f:
if re.compile("^%[ \t]*").match(line):
if targetnum == None:
index_rec(db, numrec, rec)
elif targetnum == numrec:
output_rec(rec)
exit(0)
numrec += 1
rec = ""
else:
rec += line
if targetnum == None:
index_rec(db, 0, "")