|
a/Allura/test-light.py |
|
b/Allura/test-light.py |
1 |
import sys
|
1 |
import sys
|
2 |
import logging
|
|
|
3 |
from collections import defaultdict
|
|
|
4 |
from itertools import chain, izip
|
|
|
5 |
from datetime import datetime
|
|
|
6 |
from cPickle import dumps
|
|
|
7 |
|
2 |
|
8 |
import bson
|
|
|
9 |
from pylons import c
|
3 |
from pylons import c
|
10 |
from pymongo.errors import DuplicateKeyError
|
|
|
11 |
|
|
|
12 |
from ming.base import Object
|
|
|
13 |
|
4 |
|
14 |
from allura.lib import helpers as h
|
5 |
from allura.lib import helpers as h
|
15 |
from allura.lib import utils
|
|
|
16 |
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
|
6 |
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
|
17 |
from allura.model.repo import LastCommitDoc, CommitRunDoc
|
7 |
from allura.model.repo import LastCommitDoc, CommitRunDoc
|
18 |
from allura.model.repo import Commit
|
8 |
from allura.model.repo_refresh import refresh_repo
|
19 |
from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
|
|
|
20 |
|
|
|
21 |
log = logging.getLogger(__name__)
|
|
|
22 |
|
|
|
23 |
QSIZE=100
|
|
|
24 |
|
9 |
|
25 |
def main():
    '''Rebuild the derived repository documents for the 'code' tool of project 'test'.

    When any command-line argument is given, the Git tool is first installed
    as 'code' from a hard-coded local path.  The commit, tree, trees, diff,
    last-commit and commit-run collections are then emptied and repopulated
    in successive passes over the commits that are not yet in the database.

    NOTE(review): this body was recovered from a rendered side-by-side diff
    in which two revisions of main() were interleaved.  It follows the
    revision that uses the helpers defined in this file; the other revision
    also installed an 'Hg' tool as 'code2' and delegated the work to
    allura.model.repo_refresh.refresh_repo -- confirm which one is wanted.
    '''
    if len(sys.argv) > 1:
        h.set_context('test')
        c.project.install_app(
            'Git', 'code', 'Code', init_from_url='/home/rick446/src/forge')
    h.set_context('test', 'code')
    CommitDoc.m.remove({})
    TreeDoc.m.remove({})
    TreesDoc.m.remove({})
    DiffInfoDoc.m.remove({})
    LastCommitDoc.m.remove({})
    CommitRunDoc.m.remove({})

    # Get all commits (repo-specific)
    all_commit_ids = list(c.app.repo.all_commit_ids())

    # Skip commits that are already in the DB (repo-agnostic)
    commit_ids = unknown_commit_ids(all_commit_ids)
    # commit_ids = commit_ids[:500]
    log.info('Refreshing %d commits', len(commit_ids))

    # Refresh commits (repo-specific)
    seen = set()
    for count, oid in enumerate(commit_ids, 1):
        c.app.repo.refresh_commit_info(oid, seen)
        if count % 100 == 0:
            log.info('Refresh commit info %d: %s', count, oid)

    #############################################
    # Everything below here is repo-agnostic
    #############################################

    refresh_repo(commit_ids, c.app.repo)

    # Refresh child references
    seen = set()
    parents = set()

    for count, oid in enumerate(commit_ids, 1):
        ci = next(CommitDoc.m.find(dict(_id=oid), validate=False))
        refresh_children(ci)
        seen.add(ci._id)
        parents.update(ci.parent_ids)
        if count % 100 == 0:
            log.info('Refresh child (a) info %d: %s', count, ci._id)
    # Parents that were not part of this batch: continue the running count
    # explicitly instead of relying on the previous loop's leaked index.
    for count, oid in enumerate(parents - seen, len(commit_ids) + 1):
        try:
            ci = next(CommitDoc.m.find(dict(_id=oid), validate=False))
        except StopIteration:
            # Parent commit is not in the DB; nothing to update.
            continue
        refresh_children(ci)
        if count % 100 == 0:
            log.info('Refresh child (b) info %d: %s', count, ci._id)

    # Refresh commit runs
    rb = CommitRunBuilder(commit_ids)
    rb.run()
    rb.cleanup()

    # Refresh trees
    cache = {}
    for count, oid in enumerate(commit_ids, 1):
        ci = next(CommitDoc.m.find(dict(_id=oid), validate=False))
        cache = refresh_commit_trees(ci, cache)
        if count % 100 == 0:
            log.info('Refresh commit trees %d: %s', count, ci._id)

    # Compute diffs
    cache = {}
    for count, oid in enumerate(commit_ids, 1):
        ci = next(CommitDoc.m.find(dict(_id=oid), validate=False))
        compute_diffs(c.app.repo._id, cache, ci)
        if count % 100 == 0:
            log.info('Compute diffs %d: %s', count, ci._id)
|
|
|
98 |
|
|
|
99 |
def refresh_commit_trees(ci, cache):
    '''Store the flattened list of tree ids for commit ``ci``.

    ``cache`` maps a tree id to the ids of its direct subtrees and is filled
    in by ``trees()`` while walking.  Returns a new dict holding only the
    cache entries for the trees of this commit.
    '''
    flattened = list(trees(ci.tree_id, cache))
    trees_doc = TreesDoc(dict(_id=ci._id, tree_ids=flattened))
    trees_doc.m.save(safe=False)
    pruned = {}
    for oid in trees_doc.tree_ids:
        pruned[oid] = cache[oid]
    return pruned
|
|
|
108 |
|
|
|
109 |
def refresh_commit_info(ci, seen):
    '''Insert a CommitDoc for ``ci`` and refresh its tree.

    ``ci`` is read through ``hexsha``, ``tree``, ``committer``, ``author``,
    the ``*_date`` / ``*_tz_offset`` attributes, ``message`` and ``parents``
    (presumably a GitPython commit -- TODO confirm).  ``seen`` is passed on to
    ``refresh_tree``.

    Returns False when the commit is already stored (or a concurrent insert
    raises DuplicateKeyError), True after the commit has been inserted and
    its tree refreshed.
    '''
    existing = CommitDoc.m.find(dict(_id=ci.hexsha)).count()
    if existing != 0:
        return False
    try:
        doc_id = ci.hexsha
        tree_id = ci.tree.hexsha
        committed = Object(
            name=h.really_unicode(ci.committer.name),
            email=h.really_unicode(ci.committer.email),
            date=datetime.utcfromtimestamp(
                ci.committed_date - ci.committer_tz_offset))
        authored = Object(
            name=h.really_unicode(ci.author.name),
            email=h.really_unicode(ci.author.email),
            date=datetime.utcfromtimestamp(
                ci.authored_date - ci.author_tz_offset))
        message = h.really_unicode(ci.message or '')
        parent_ids = [p.hexsha for p in ci.parents]
        ci_doc = CommitDoc(dict(
            _id=doc_id,
            tree_id=tree_id,
            committed=committed,
            authored=authored,
            message=message,
            child_ids=[],
            parent_ids=parent_ids))
        ci_doc.m.insert(safe=True)
    except DuplicateKeyError:
        # Somebody else stored this commit between the check and the insert.
        return False
    refresh_tree(ci.tree, seen)
    return True
|
|
|
134 |
|
|
|
135 |
def refresh_repo(commit_ids, repo):
    '''Register ``commit_ids`` as artifacts of ``repo``.

    Working in chunks of QSIZE, saves an ArtifactReferenceDoc and a
    ShortlinkDoc per commit, then adds ``repo._id`` to ``repo_ids`` of every
    CommitDoc in the chunk that does not list it yet.
    '''
    for batch in utils.chunked_iter(commit_ids, QSIZE):
        batch = list(batch)
        # Create shortlinks and artifactrefs
        for oid in batch:
            index_id = 'allura.model.repo.Commit#' + oid
            artifact_reference = dict(
                cls=dumps(Commit),
                project_id=repo.app.config.project_id,
                app_config_id=repo.app.config._id,
                artifact_id=oid)
            ref = ArtifactReferenceDoc(dict(
                _id=index_id,
                artifact_reference=artifact_reference,
                references=[]))
            link = ShortlinkDoc(dict(
                _id=bson.ObjectId(),
                ref_id=index_id,
                project_id=repo.app.config.project_id,
                app_config_id=repo.app.config._id,
                link=repo.shorthand_for_commit(oid),
                url=repo.url() + 'ci/' + oid + '/'))
            ref.m.save(safe=False, validate=False)
            link.m.save(safe=False, validate=False)
        not_yet_tagged = dict(
            _id={'$in': batch},
            repo_ids={'$ne': repo._id})
        CommitDoc.m.update_partial(
            not_yet_tagged,
            {'$addToSet': dict(repo_ids=repo._id)},
            multi=True)
|
|
|
164 |
|
|
|
165 |
def refresh_children(ci):
    '''Add ``ci._id`` to the ``child_ids`` of each of its parent commits.

    Commits without parents are skipped: an ``$in`` query over an empty
    list matches no documents, so the update would be a wasted round trip.
    '''
    if not ci.parent_ids:
        return
    CommitDoc.m.update_partial(
        dict(_id={'$in': ci.parent_ids}),
        {'$addToSet': dict(child_ids=ci._id)},
        multi=True)
|
|
|
170 |
|
|
|
171 |
class CommitRunBuilder(object):
    '''Group commits into linear "runs" and store them as CommitRunDoc.

    A run starts out as a single commit.  A run whose only parent commit is
    the first commit of another run absorbs that run, so a run ends up as a
    chain of commits listed child-first.
    '''

    def __init__(self, commit_ids):
        self.commit_ids = commit_ids
        self.run_index = {}  # by commit ID
        self.runs = {}  # by run ID
        self.reasons = {}  # reasons to stop merging runs

    def run(self):
        '''Build the runs for ``self.commit_ids``, save them and return them.

        Returns the dict of run id -> CommitRunDoc.
        '''
        for oids in utils.chunked_iter(self.commit_ids, QSIZE):
            oids = list(oids)
            commits = list(CommitDoc.m.find(dict(_id={'$in': oids})))
            for ci in commits:
                if ci._id in self.run_index:
                    continue
                self.run_index[ci._id] = ci._id
                self.runs[ci._id] = CommitRunDoc(dict(
                    _id=ci._id,
                    parent_commit_ids=ci.parent_ids,
                    commit_ids=[ci._id],
                    commit_times=[ci.authored.date]))
            self.merge_runs()
        log.info('%d runs', len(self.runs))
        for rid, run in sorted(self.runs.items()):
            log.info('%32s: %r', self.reasons.get(rid, 'none'), run._id)
        for run in self.runs.values():
            run.m.save()
        return self.runs

    def _all_runs(self):
        '''Yield each stored run touching ``self.commit_ids``, then the runs
        containing their parent commits, each run at most once.
        '''
        found = {}
        for oids in utils.chunked_iter(self.commit_ids, QSIZE):
            oids = list(oids)
            for run in CommitRunDoc.m.find(dict(commit_ids={'$in': oids})):
                found[run._id] = run
        seen_run_ids = set()
        pending = list(found.values())
        while pending:
            run = pending.pop()
            if run._id in seen_run_ids:
                continue
            seen_run_ids.add(run._id)
            yield run
            # Queue the runs holding this run's parent commits; a distinct
            # name keeps the run just yielded from being rebound.
            for parent_run in CommitRunDoc.m.find(
                    dict(commit_ids={'$in': run.parent_commit_ids})):
                pending.append(parent_run)

    def cleanup(self):
        '''Delete non-maximal runs'''
        for run1 in self._all_runs():
            for run2 in CommitRunDoc.m.find(dict(
                    commit_ids=run1.commit_ids[0])):
                if run1._id == run2._id:
                    continue
                log.info('... delete %r (part of %r)', run2, run1)
                run2.m.delete()

    def merge_runs(self):
        '''Merge runs in ``self.runs`` until no run can absorb its parent run.

        For every run that is not merged, ``self.reasons`` records why.
        '''
        while True:
            for run_id, run in self.runs.items():
                if len(run.parent_commit_ids) != 1:
                    self.reasons[run_id] = '%d parents' % len(run.parent_commit_ids)
                    continue
                p_oid = run.parent_commit_ids[0]
                p_run_id = self.run_index.get(p_oid)
                if p_run_id is None:
                    self.reasons[run_id] = 'parent commit not found'
                    continue
                p_run = self.runs.get(p_run_id)
                if p_run is None:
                    self.reasons[run_id] = 'parent run not found'
                    continue
                if p_run.commit_ids[0] != p_oid:
                    self.reasons[run_id] = 'parent does not start with parent commit'
                    continue
                run.commit_ids += p_run.commit_ids
                run.commit_times += p_run.commit_times
                run.parent_commit_ids = p_run.parent_commit_ids
                for oid in p_run.commit_ids:
                    self.run_index[oid] = run_id
                # Stop iterating before the dict is modified below.
                break
            else:
                # A full pass found nothing left to merge.
                break
            del self.runs[p_run_id]
|
|
|
252 |
|
|
|
253 |
def refresh_tree(t, seen):
    '''Save a TreeDoc for tree ``t`` and, recursively, for its subtrees.

    ``seen`` is a set of ``binsha`` values of trees already handled; it is
    updated in place so each tree is processed only once.
    '''
    if t.binsha in seen:
        return
    seen.add(t.binsha)
    doc = TreeDoc(dict(
        _id=t.hexsha,
        tree_ids=[],
        blob_ids=[],
        other_ids=[]))
    for entry in t:
        obj = Object(
            name=h.really_unicode(entry.name),
            id=entry.hexsha)
        kind = entry.type
        if kind == 'tree':
            refresh_tree(entry, seen)
            doc.tree_ids.append(obj)
        elif kind == 'blob':
            doc.blob_ids.append(obj)
        else:
            # Anything else keeps its type alongside name and id.
            obj.type = kind
            doc.other_ids.append(obj)
    doc.m.save(safe=False)
|
|
|
274 |
|
|
|
275 |
def trees(id, cache):
    '''Yield ``id`` and the ids of every tree below it, depth-first, parents first.

    ``cache`` maps a tree id to the list of its direct subtree ids.  Missing
    entries are loaded from TreeDoc and stored in ``cache``.

    An explicit stack replaces the nested recursive generators, so deep
    trees neither hit the recursion limit nor re-yield every id once per
    ancestor level.
    '''
    pending = [id]
    while pending:
        tree_id = pending.pop()
        yield tree_id
        entries = cache.get(tree_id, None)
        if entries is None:
            t = TreeDoc.m.get(_id=tree_id)
            entries = [o.id for o in t.tree_ids]
            cache[tree_id] = entries
        # Reversed, so the first child is popped (and yielded) first.
        pending.extend(reversed(entries))
|
|
|
285 |
|
|
|
286 |
def unknown_commit_ids(all_commit_ids):
    '''Return the ids in ``all_commit_ids`` that have no CommitDoc yet.

    Ids are looked up in chunks of QSIZE and returned in their input order.
    '''
    result = []
    for chunk in utils.chunked_iter(all_commit_ids, QSIZE):
        # Materialize the chunk: it is used both in the query and in the
        # filter below, as the other chunked loops in this file do.
        chunk = list(chunk)
        # Pass the query as a spec dict, like every other find() call here.
        q = CommitDoc.m.find(dict(_id={'$in': chunk}))
        known_commit_ids = set(ci._id for ci in q)
        result.extend(oid for oid in chunk if oid not in known_commit_ids)
    return result
|
|
|
293 |
|
|
|
294 |
def compute_diffs(repo_id, tree_cache, rhs_ci):
    '''Store the diff between commit ``rhs_ci`` and its first parent.

    Saves a DiffInfoDoc listing every changed path, and records ``rhs_ci``
    as the last commit for each changed object and for everything inside a
    changed tree.  With no parent, or a parent missing from CommitDoc, the
    commit is diffed against an empty tree.

    ``tree_cache`` maps tree id -> TreeDoc.  It is updated in place to hold
    only the trees of ``rhs_ci``, and is also returned.
    '''
    def _walk_tree(tree, tree_index):
        # Yield the id of every object below ``tree``.
        for x in tree.blob_ids:
            yield x.id
        for x in tree.other_ids:
            yield x.id
        for x in tree.tree_ids:
            yield x.id
            for xx in _walk_tree(tree_index[x.id], tree_index):
                yield xx

    rhs_tree_ids = TreesDoc.m.get(_id=rhs_ci._id).tree_ids
    if rhs_ci.parent_ids:
        lhs_ci = CommitDoc.m.get(_id=rhs_ci.parent_ids[0])
    else:
        lhs_ci = None
    if lhs_ci is not None:
        lhs_tree_ids = TreesDoc.m.get(_id=lhs_ci._id).tree_ids
    else:
        lhs_tree_ids = []
    # Both commits usually share most trees; query each missing id once.
    wanted_tree_ids = set(lhs_tree_ids)
    wanted_tree_ids.update(rhs_tree_ids)
    new_tree_ids = [
        tid for tid in wanted_tree_ids
        if tid not in tree_cache]
    tree_index = dict(
        (t._id, t)
        for t in TreeDoc.m.find(dict(_id={'$in': new_tree_ids}), validate=False))
    tree_index.update(tree_cache)
    rhs_tree_ids_set = set(rhs_tree_ids)
    # Keep only this commit's trees cached for the next call.
    tree_cache.clear()
    tree_cache.update(
        (tid, t) for tid, t in tree_index.items() if tid in rhs_tree_ids_set)
    rhs_tree = tree_index[rhs_ci.tree_id]
    if lhs_ci is None:
        lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
    else:
        lhs_tree = tree_index[lhs_ci.tree_id]
    differences = []
    for name, lhs_id, rhs_id in _diff_trees(lhs_tree, rhs_tree, tree_index):
        differences.append(
            dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
        # Set last commit info
        if rhs_id is not None:
            _set_last_commit(repo_id, rhs_id, rhs_ci)
        # When the changed object is a tree, everything in it gets this commit.
        changed_tree = tree_index.get(rhs_id, None)
        if changed_tree is not None:
            for oid in _walk_tree(changed_tree, tree_index):
                _set_last_commit(repo_id, oid, rhs_ci)
    di = DiffInfoDoc(dict(
        _id=rhs_ci._id,
        differences=differences))
    di.m.save()
    return tree_cache
|
|
|
343 |
|
|
|
344 |
def _diff_trees(lhs, rhs, index, *path):
|
|
|
345 |
def _fq(name):
|
|
|
346 |
return '/'.join(reversed(
|
|
|
347 |
(name,) + path))
|
|
|
348 |
# Diff the trees
|
|
|
349 |
rhs_tree_ids = dict(
|
|
|
350 |
(o.name, o.id)
|
|
|
351 |
for o in rhs.tree_ids)
|
|
|
352 |
for o in lhs.tree_ids:
|
|
|
353 |
rhs_id = rhs_tree_ids.pop(o.name, None)
|
|
|
354 |
if rhs_id == o.id:
|
|
|
355 |
continue # no change
|
|
|
356 |
elif rhs_id is None:
|
|
|
357 |
yield (_fq(o.name), o.id, None)
|
|
|
358 |
else:
|
|
|
359 |
for difference in _diff_trees(
|
|
|
360 |
index[o.id], index[rhs_id], index,
|
|
|
361 |
o.name, *path):
|
|
|
362 |
yield difference
|
|
|
363 |
for name, id in rhs_tree_ids.items():
|
|
|
364 |
yield (_fq(name), None, id)
|
|
|
365 |
# DIff the blobs
|
|
|
366 |
rhs_blob_ids = dict(
|
|
|
367 |
(o.name, o.id)
|
|
|
368 |
for o in rhs.blob_ids)
|
|
|
369 |
for o in lhs.blob_ids:
|
|
|
370 |
rhs_id = rhs_blob_ids.pop(o.name, None)
|
|
|
371 |
if rhs_id == o.id:
|
|
|
372 |
continue # no change
|
|
|
373 |
elif rhs_id is None:
|
|
|
374 |
yield (_fq(o.name), o.id, None)
|
|
|
375 |
else:
|
|
|
376 |
yield (_fq(o.name), o.id, rhs_id)
|
|
|
377 |
for name, id in rhs_blob_ids.items():
|
|
|
378 |
yield (_fq(name), None, id)
|
|
|
379 |
|
|
|
380 |
def _set_last_commit(repo_id, oid, commit):
    '''Save and return a LastCommitDoc naming ``commit`` as the last commit
    of object ``oid`` in repository ``repo_id``.
    '''
    commit_info = dict(
        id=commit._id,
        author=commit.authored.name,
        author_email=commit.authored.email,
        date=commit.authored.date,
        # author_url=commit.author_url,
        # href=commit.url(),
        # shortlink=commit.shorthand_id(),
        # summary=commit.summary
    )
    lc = LastCommitDoc(dict(
        _id='%s:%s' % (repo_id, oid),
        repo_id=repo_id,
        object_id=oid,
        commit_info=commit_info))
    lc.m.save(safe=False)
    return lc
|
|
|
397 |
|
27 |
|
398 |
# Script entry point: run main() only when executed directly.
if __name__ == '__main__':
    main()
    # dolog()
|