Download this file

docdups.py    117 lines (95 with data), 3.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/env python
import sys
import xapian
# Whether the index strips case and diacritics. Assumed True until
# init_stripchars() finds evidence of a raw index and sets it to False.
o_index_stripchars = True
# Term prefix used for the MD5 terms. init_stripchars() recomputes it
# through wrap_prefix() once the index type is known.
md5wpref = "XM"
# Handle caps/diac-stripping option. If the db is raw the prefixes are
# wrapped with ":"
def wrap_prefix(prefix):
    """Return `prefix` in the form used by the current index.

    When the module-level flag `o_index_stripchars` is true the prefix is
    returned unchanged; otherwise it is enclosed in colons (":prefix:").
    """
    if not o_index_stripchars:
        return ":" + prefix + ":"
    return prefix
def init_stripchars(xdb):
    """Detect whether `xdb` is a raw index and set the globals accordingly.

    A raw index is recognised by the presence of at least one term that
    starts with ":" (the wrapped-prefix marker). When one is found,
    `o_index_stripchars` is set to False. `md5wpref` is then recomputed
    with wrap_prefix() so it matches the index type.

    xdb: an open xapian.Database.
    """
    global o_index_stripchars
    global md5wpref
    # Restrict the term walk to the ":" prefix: the loop body runs at most
    # once, and an index with no such term (including an empty one) simply
    # yields nothing instead of raising from a manual skip_to().
    for _term in xdb.allterms(":"):
        o_index_stripchars = False
        break
    md5wpref = wrap_prefix("XM")
# Retrieve named value from document data record.
# The record format is a sequence of nm=value lines
def get_attribute(xdb, docid, fld):
    """Return the value of field `fld` from a document's data record.

    The data record is a sequence of "name=value" lines. The first line
    whose name is exactly `fld` wins.

    xdb:   database object providing get_document(docid).
    docid: document id to look up.
    fld:   field name, without the trailing "=".

    Returns the value as a string, or "" when the field is absent.
    """
    data = xdb.get_document(docid).get_data()
    key = fld + "="
    # Match only at the start of a line so that a longer field name or
    # another field's value containing "fld=" is not picked up, and take
    # the whole line so a last line without a newline keeps its final
    # character.
    for line in data.split("\n"):
        if line.startswith(key):
            return line[len(key):]
    return ""
# Convenience: retrieve postings as Python list
def get_postlist(xdb, term):
    """Return the docids of every posting of `term` as a Python list.

    xdb:  database object providing postlist(term).
    term: the index term to look up.
    """
    return [posting.docid for posting in xdb.postlist(term)]
# Return list of docids having same md5 including self
def get_dups(xdb, docid):
    """Return the docids sharing the MD5 term of `docid`, itself included.

    xdb:   an open xapian.Database.
    docid: document id; converted with int(), so a numeric string is
           accepted.

    Returns a list of docids, or None when the document carries no MD5
    term.
    """
    doc = xdb.get_document(int(docid))
    # It would be more efficient to retrieve the value, but it's
    # binary so we'd have to decode it
    try:
        md5term = doc.termlist().skip_to(md5wpref).term
    except StopIteration:
        # skip_to() signals "nothing at or after the prefix" this way;
        # for this function it just means there is no MD5 term.
        return None
    if not md5term.startswith(md5wpref):
        return None
    return get_postlist(xdb, md5term)
# Retrieve all sets of duplicates:
# walk the list of all MD5 terms, look up their posting lists, and
# store the docids where the list is longer than one.
def find_all_dups(xdb):
    """Return every set of duplicate documents in the index.

    Each MD5 term's posting list is looked up; lists holding more than one
    docid are collected.

    xdb: an open xapian.Database.

    Returns a list of docid lists, one per group of duplicates.
    """
    alldups = []
    # Asking the database for the prefix directly restricts the walk to
    # the MD5 terms, starting with the very first one, so no manual
    # positioning or end-of-range check is needed.
    for term in xdb.allterms(md5wpref):
        dups = get_postlist(xdb, term.term)
        if len(dups) > 1:
            alldups.append(dups)
    return alldups
# Print docid url ipath for list of docids
def print_urlipath(xdb, doclist):
    """Write one "docid url ipath" line to stdout per entry of `doclist`.

    The url and ipath values are read from each document's data record
    with get_attribute().
    """
    for docid in doclist:
        fields = (
            docid,
            get_attribute(xdb, docid, "url"),
            get_attribute(xdb, docid, "ipath"),
        )
        sys.stdout.write("%s %s %s\n" % fields)
########## Main program
# Command line: /path/to/db [docid ...]
# With no docid, print every set of duplicates, one blank line after each
# set. With docids, print only the duplicates of each given document.
if len(sys.argv) < 2:
    sys.stderr.write(
        "Usage: %s /path/to/db [docid [docid ...]]\n" % sys.argv[0])
    sys.stderr.write(" will print all sets of dups if no docid is given\n")
    sys.stderr.write(" else only the duplicates for the given docids\n")
    sys.exit(1)

xdbpath = sys.argv[1]

try:
    # Opening the index and probing its type are inside the try block so
    # that an unreadable or missing database is reported through the same
    # error message as any later failure, not as a raw traceback.
    xdb = xapian.Database(xdbpath)
    init_stripchars(xdb)

    if len(sys.argv) == 2:
        # No docid args,
        alldups = find_all_dups(xdb)
        for dups in alldups:
            print_urlipath(xdb, dups)
            sys.stdout.write("\n")
    else:
        for docid in sys.argv[2:]:
            dups = get_dups(xdb, docid)
            # A document alone in its posting list has no duplicate.
            if dups is not None and len(dups) > 1:
                print_urlipath(xdb, dups)
except Exception as e:
    sys.stderr.write("Xapian error: %s\n" % str(e))
    sys.exit(1)