|
a/Allura/allura/model/repo_refresh.py |
|
b/Allura/allura/model/repo_refresh.py |
1 |
import logging
|
1 |
import logging
|
2 |
from itertools import chain
|
2 |
from itertools import chain
|
3 |
from cPickle import dumps
|
3 |
from cPickle import dumps
|
4 |
import re
|
4 |
import re
|
|
|
5 |
import os
|
5 |
|
6 |
|
6 |
import bson
|
7 |
import bson
|
7 |
|
8 |
|
8 |
import tg
|
9 |
import tg
|
9 |
|
10 |
|
10 |
from pylons import g
|
11 |
from pylons import g,c
|
11 |
|
12 |
|
12 |
from ming.base import Object
|
13 |
from ming.base import Object
|
13 |
from ming.orm import mapper, session
|
14 |
from ming.orm import mapper, session, ThreadLocalORMSession
|
14 |
|
15 |
|
15 |
from allura.lib import utils
|
16 |
from allura.lib import utils
|
16 |
from allura.lib import helpers as h
|
17 |
from allura.lib import helpers as h
|
17 |
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
|
18 |
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
|
18 |
from allura.model.repo import LastCommitDoc, CommitRunDoc
|
19 |
from allura.model.repo import LastCommitDoc, CommitRunDoc
|
19 |
from allura.model.repo import Commit
|
20 |
from allura.model.repo import Commit, Tree, LastCommit, ModelCache
|
20 |
from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
|
21 |
from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
|
21 |
|
22 |
|
22 |
log = logging.getLogger(__name__)
|
23 |
log = logging.getLogger(__name__)
|
23 |
|
24 |
|
24 |
QSIZE=100
|
25 |
QSIZE=100
|
|
... |
|
... |
55 |
ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
|
56 |
ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
|
56 |
refresh_children(ci)
|
57 |
refresh_children(ci)
|
57 |
if (i+1) % 100 == 0:
|
58 |
if (i+1) % 100 == 0:
|
58 |
log.info('Refresh child info %d for parents of %s', (i+1), ci._id)
|
59 |
log.info('Refresh child info %d for parents of %s', (i+1), ci._id)
|
59 |
|
60 |
|
60 |
if repo.tool.lower() != 'svn':
|
61 |
if repo._refresh_precompute:
|
61 |
# Refresh commit runs
|
62 |
# Refresh commit runs
|
62 |
commit_run_ids = commit_ids
|
63 |
commit_run_ids = commit_ids
|
63 |
# Check if the CommitRuns for the repo are in a good state by checking for
|
64 |
# Check if the CommitRuns for the repo are in a good state by checking for
|
64 |
# a CommitRunDoc that contains the last known commit. If there isn't one,
|
65 |
# a CommitRunDoc that contains the last known commit. If there isn't one,
|
65 |
# the CommitRuns for this repo are in a bad state - rebuild them entirely.
|
66 |
# the CommitRuns for this repo are in a bad state - rebuild them entirely.
|
|
... |
|
... |
86 |
if (i+1) % 100 == 0:
|
87 |
if (i+1) % 100 == 0:
|
87 |
log.info('Refresh commit trees %d: %s', (i+1), ci._id)
|
88 |
log.info('Refresh commit trees %d: %s', (i+1), ci._id)
|
88 |
|
89 |
|
89 |
# Compute diffs
|
90 |
# Compute diffs
|
90 |
cache = {}
|
91 |
cache = {}
|
91 |
# Have to compute_diffs() for all commits to ensure that LastCommitDocs
|
|
|
92 |
# are set properly for forked repos. For some SCMs,
|
|
|
93 |
# we don't want to pre-compute the diffs because that would be too
|
92 |
# For some SCMs, we don't want to pre-compute the diffs because that
|
94 |
# expensive, so we skip them here and do them on-demand with caching.
|
93 |
# would be too expensive, so we skip them here and do them on-demand
|
|
|
94 |
# with caching.
|
95 |
if repo._refresh_precompute:
|
95 |
if repo._refresh_precompute:
|
96 |
for i, oid in enumerate(reversed(all_commit_ids)):
|
96 |
for i, oid in enumerate(commit_ids):
|
97 |
ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
|
97 |
ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
|
98 |
compute_diffs(repo._id, cache, ci)
|
98 |
compute_diffs(repo._id, cache, ci)
|
99 |
if (i+1) % 100 == 0:
|
99 |
if (i+1) % 100 == 0:
|
100 |
log.info('Compute diffs %d: %s', (i+1), ci._id)
|
100 |
log.info('Compute diffs %d: %s', (i+1), ci._id)
|
|
|
101 |
|
|
|
102 |
if repo._refresh_precompute:
|
|
|
103 |
cache = ModelCache()
|
|
|
104 |
for i, oid in enumerate(reversed(commit_ids)):
|
|
|
105 |
ci = cache.get(Commit, dict(_id=oid))
|
|
|
106 |
ci.set_context(repo)
|
|
|
107 |
compute_lcds(ci, cache)
|
|
|
108 |
ThreadLocalORMSession.flush_all()
|
|
|
109 |
if (i+1) % 100 == 0:
|
|
|
110 |
log.info('Compute last commit info %d: %s', (i+1), ci._id)
|
|
|
111 |
|
101 |
|
112 |
|
102 |
log.info('Refresh complete for %s', repo.full_fs_path)
|
113 |
log.info('Refresh complete for %s', repo.full_fs_path)
|
103 |
g.post_event(
|
114 |
g.post_event(
|
104 |
'repo_refreshed',
|
115 |
'repo_refreshed',
|
105 |
commit_number=len(commit_ids),
|
116 |
commit_number=len(commit_ids),
|
|
... |
|
... |
295 |
return result
|
306 |
return result
|
296 |
|
307 |
|
297 |
def compute_diffs(repo_id, tree_cache, rhs_ci):
|
308 |
def compute_diffs(repo_id, tree_cache, rhs_ci):
|
298 |
'''compute simple differences between a commit and its first parent'''
|
309 |
'''compute simple differences between a commit and each of its parents'''
|
299 |
if rhs_ci.tree_id is None: return tree_cache
|
310 |
if rhs_ci.tree_id is None: return tree_cache
|
300 |
def _walk_tree(tree, tree_index):
|
|
|
301 |
for x in tree.blob_ids: yield x.id
|
|
|
302 |
for x in tree.other_ids: yield x.id
|
|
|
303 |
for x in tree.tree_ids:
|
|
|
304 |
yield x.id
|
|
|
305 |
for xx in _walk_tree(tree_index[x.id], tree_index):
|
|
|
306 |
yield xx
|
|
|
307 |
|
311 |
|
|
|
312 |
def _update_cache(lhs_tree_ids, rhs_tree_ids):
|
|
|
313 |
# crazy cache logic that I'm not certain I understand
|
|
|
314 |
new_tree_ids = [
|
|
|
315 |
tid for tid in chain(lhs_tree_ids, rhs_tree_ids)
|
|
|
316 |
if tid not in tree_cache ]
|
|
|
317 |
tree_index = dict(
|
|
|
318 |
(t._id, t) for t in TreeDoc.m.find(dict(_id={'$in': new_tree_ids}),validate=False))
|
|
|
319 |
tree_index.update(tree_cache)
|
|
|
320 |
rhs_tree_ids_set = set(rhs_tree_ids)
|
|
|
321 |
tree_cache.clear()
|
|
|
322 |
tree_cache.update(
|
|
|
323 |
(id, t) for id,t in tree_index.iteritems() if id in rhs_tree_ids_set)
|
|
|
324 |
return tree_index
|
|
|
325 |
|
|
|
326 |
empty_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
|
|
|
327 |
commit_info = get_commit_info(rhs_ci)
|
|
|
328 |
differences = []
|
308 |
treedoc = TreesDoc.m.get(_id=rhs_ci._id)
|
329 |
rhs_treesdoc = TreesDoc.m.get(_id=rhs_ci._id)
|
309 |
|
|
|
310 |
# FIXME: There are cases of missing TreesDoc records in production
|
|
|
311 |
# that should be fixed, but this is a quick-and-dirty patch
|
|
|
312 |
# to at least staunch the bleeding. A "generate-if-missing"
|
|
|
313 |
# fix, and/or, even better, a cleanup / regen sweep plus
|
|
|
314 |
# audit to ensure there're no more bugs causing them to be
|
|
|
315 |
# missed should be done.
|
|
|
316 |
if not treedoc:
|
330 |
if not rhs_treesdoc:
|
|
|
331 |
# FIXME: These sometimes don't exist for unknown reasons; they should be auto-gen'ed
|
|
|
332 |
log.error('Missing TreesDoc: %s', rhs_ci)
|
317 |
return tree_cache
|
333 |
return tree_cache
|
318 |
|
|
|
319 |
rhs_tree_ids = treedoc.tree_ids
|
|
|
320 |
|
|
|
321 |
if rhs_ci.parent_ids:
|
334 |
for lhs_cid in rhs_ci.parent_ids:
|
322 |
lhs_ci = CommitDoc.m.get(_id=rhs_ci.parent_ids[0])
|
335 |
lhs_ci = CommitDoc.m.get(_id=lhs_cid)
|
323 |
else:
|
|
|
324 |
lhs_ci = None
|
|
|
325 |
if lhs_ci is not None:
|
|
|
326 |
lhs_tree_ids = TreesDoc.m.get(_id=lhs_ci._id).tree_ids
|
|
|
327 |
else:
|
|
|
328 |
lhs_tree_ids = []
|
|
|
329 |
new_tree_ids = [
|
|
|
330 |
tid for tid in chain(lhs_tree_ids, rhs_tree_ids)
|
|
|
331 |
if tid not in tree_cache ]
|
|
|
332 |
tree_index = dict(
|
|
|
333 |
(t._id, t) for t in TreeDoc.m.find(dict(_id={'$in': new_tree_ids}),validate=False))
|
|
|
334 |
tree_index.update(tree_cache)
|
|
|
335 |
rhs_tree_ids_set = set(rhs_tree_ids)
|
|
|
336 |
tree_cache.clear()
|
|
|
337 |
tree_cache.update(
|
|
|
338 |
(id, t) for id,t in tree_index.iteritems() if id in rhs_tree_ids_set)
|
|
|
339 |
rhs_tree = tree_index[rhs_ci.tree_id]
|
|
|
340 |
if lhs_ci is None:
|
336 |
if lhs_ci is None:
|
341 |
lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
|
337 |
log.error('Commit ID referenced as parent but not found: %s parent of %s', lhs_cid, rhs_ci)
|
342 |
else:
|
338 |
continue
|
|
|
339 |
lhs_treesdoc = TreesDoc.m.get(_id=lhs_cid)
|
|
|
340 |
if not lhs_treesdoc:
|
|
|
341 |
# FIXME: These sometimes don't exist for unknown reasons; they should be auto-gen'ed
|
|
|
342 |
log.error('Missing TreesDoc: %s', rhs_ci)
|
|
|
343 |
continue
|
|
|
344 |
tree_index = _update_cache(lhs_treesdoc.tree_ids, rhs_treesdoc.tree_ids)
|
343 |
lhs_tree = tree_index[lhs_ci.tree_id]
|
345 |
rhs_tree = tree_index[rhs_ci.tree_id]
|
344 |
differences = []
|
346 |
lhs_tree = tree_index.get(lhs_ci.tree_id, empty_tree)
|
345 |
commit_info = get_commit_info(rhs_ci)
|
|
|
346 |
for name, lhs_id, rhs_id in _diff_trees(lhs_tree, rhs_tree, tree_index):
|
347 |
for name, lhs_id, rhs_id in _diff_trees(lhs_tree, rhs_tree, tree_index):
|
347 |
differences.append(
|
348 |
differences.append(
|
348 |
dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
|
349 |
dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
|
349 |
# Set last commit data
|
350 |
if not rhs_ci.parent_ids:
|
|
|
351 |
# no parents, so everything in rhs is new
|
|
|
352 |
tree_index = _update_cache([], rhs_treesdoc.tree_ids)
|
350 |
rhs_tree = tree_index[rhs_ci.tree_id]
|
353 |
rhs_tree = tree_index[rhs_ci.tree_id]
|
351 |
refresh_last_commit(repo_id, '/', rhs_tree, lhs_tree, None, commit_info)
|
354 |
for name, lhs_id, rhs_id in _diff_trees(empty_tree, rhs_tree, tree_index):
|
|
|
355 |
differences.append(
|
|
|
356 |
dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
|
352 |
# Build the diffinfo
|
357 |
# Build the diffinfo
|
353 |
di = DiffInfoDoc(dict(
|
358 |
di = DiffInfoDoc(dict(
|
354 |
_id=rhs_ci._id,
|
359 |
_id=rhs_ci._id,
|
355 |
differences=differences))
|
360 |
differences=differences))
|
356 |
di.m.save()
|
361 |
di.m.save()
|
|
... |
|
... |
418 |
# Diff the trees
|
423 |
# Diff the trees
|
419 |
rhs_tree_ids = dict(
|
424 |
rhs_tree_ids = dict(
|
420 |
(o.name, o.id)
|
425 |
(o.name, o.id)
|
421 |
for o in rhs.tree_ids)
|
426 |
for o in rhs.tree_ids)
|
422 |
for o in lhs.tree_ids:
|
427 |
for o in lhs.tree_ids:
|
423 |
rhs_id = rhs_tree_ids.pop(o.name, None)
|
428 |
rhs_id = rhs_tree_ids.pop(o.name, None) # remove so won't be picked up as added, below
|
424 |
if rhs_id == o.id:
|
429 |
if rhs_id == o.id: # no change
|
425 |
continue # no change
|
430 |
continue
|
426 |
elif rhs_id is None:
|
431 |
elif rhs_id is None: # removed
|
427 |
yield (_fq(o.name), o.id, None)
|
432 |
yield (_fq(o.name), o.id, None)
|
428 |
else:
|
433 |
rhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
|
429 |
for difference in _diff_trees(
|
434 |
else: # changed
|
430 |
index[o.id], index[rhs_id], index,
|
435 |
rhs_tree = index[rhs_id]
|
431 |
o.name, *path):
|
436 |
for difference in _diff_trees(index[o.id], rhs_tree, index, o.name, *path):
|
432 |
yield difference
|
437 |
yield difference
|
433 |
for name, id in rhs_tree_ids.items():
|
438 |
for name, id in rhs_tree_ids.items(): # added
|
434 |
yield (_fq(name), None, id)
|
439 |
yield (_fq(name), None, id)
|
|
|
440 |
lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
|
|
|
441 |
for difference in _diff_trees(lhs_tree, index[id], index, name, *path):
|
|
|
442 |
yield difference
|
435 |
# Diff the blobs
|
443 |
# Diff the blobs
|
436 |
rhs_blob_ids = dict(
|
444 |
rhs_blob_ids = dict(
|
437 |
(o.name, o.id)
|
445 |
(o.name, o.id)
|
438 |
for o in rhs.blob_ids)
|
446 |
for o in rhs.blob_ids)
|
439 |
for o in lhs.blob_ids:
|
447 |
for o in lhs.blob_ids:
|
|
... |
|
... |
460 |
author_url=commit.author_url,
|
468 |
author_url=commit.author_url,
|
461 |
shortlink=commit.shorthand_id(),
|
469 |
shortlink=commit.shorthand_id(),
|
462 |
summary=commit.summary
|
470 |
summary=commit.summary
|
463 |
)
|
471 |
)
|
464 |
|
472 |
|
465 |
def refresh_last_commit(repo_id, path, tree, lhs_tree, parent_tree, commit_info):
|
|
|
466 |
'''Build the LastCommit info.
|
|
|
467 |
|
|
|
468 |
We only need to create LastCommit info for objects that are in the
|
|
|
469 |
RHS but not in the LHS, because those objects are the only ones
|
|
|
470 |
who have had anything changed in them. (If file x/y/z.txt changes,
|
|
|
471 |
then its hash will change, which also forces the hash for tree x/y
|
|
|
472 |
to change, as well as the hash for tree x. So as long as an object's
|
|
|
473 |
hash isn't in the LHS, it means it's new or modified in this commit.)
|
|
|
474 |
|
|
|
475 |
In order to uniquely identify the tree or blob that a LastCommitDoc is
|
|
|
476 |
for, the tree or blob hash is not sufficient; we also need to know
|
|
|
477 |
either its full path name, or its parent tree and name. Because of
|
|
|
478 |
this, we have to walk down the commit tree.'''
|
|
|
479 |
if lhs_tree is not None and tree._id == lhs_tree._id:
|
|
|
480 |
# tree was not changed in this commit (nor was anything under it)
|
|
|
481 |
return
|
|
|
482 |
|
|
|
483 |
# map LHS entries for easy lookup
|
|
|
484 |
lhs_map = {}
|
|
|
485 |
if lhs_tree:
|
|
|
486 |
for lhs_child in chain(lhs_tree.tree_ids, lhs_tree.blob_ids, lhs_tree.other_ids):
|
|
|
487 |
lhs_map[lhs_child.name] = lhs_child.id
|
|
|
488 |
|
|
|
489 |
# update our children
|
|
|
490 |
for child in chain(tree.tree_ids, tree.blob_ids, tree.other_ids):
|
|
|
491 |
if child.id != lhs_map.get(child.name, None): # check if changed in this commit
|
|
|
492 |
lc = set_last_commit(repo_id, path, child.name, child.id, commit_info)
|
|
|
493 |
|
|
|
494 |
# (re)curse at our child trees
|
|
|
495 |
for child_tree in tree.tree_ids:
|
|
|
496 |
child_name = child_tree.name
|
|
|
497 |
child_tree = TreeDoc.m.get(_id=child_tree.id)
|
|
|
498 |
lhs_child = None
|
|
|
499 |
if child_name in lhs_map:
|
|
|
500 |
lhs_child = TreeDoc.m.get(_id=lhs_map[child_name])
|
|
|
501 |
refresh_last_commit(repo_id, path + child_name + '/', child_tree, lhs_child, tree, commit_info)
|
|
|
502 |
|
|
|
503 |
def set_last_commit(repo_id, path, name, oid, commit_info):
|
|
|
504 |
lc = LastCommitDoc(dict(
|
|
|
505 |
_id='%s:%s:%s' % (repo_id, path, name),
|
|
|
506 |
object_id=oid,
|
|
|
507 |
name=name,
|
|
|
508 |
commit_info=commit_info))
|
|
|
509 |
lc.m.save(safe=False, upsert=True)
|
|
|
510 |
return lc
|
|
|
511 |
|
|
|
512 |
def last_known_commit_id(all_commit_ids, new_commit_ids):
|
473 |
def last_known_commit_id(all_commit_ids, new_commit_ids):
|
513 |
"""
|
474 |
"""
|
514 |
Return the newest "known" (cached in mongo) commit id.
|
475 |
Return the newest "known" (cached in mongo) commit id.
|
515 |
|
476 |
|
516 |
Params:
|
477 |
Params:
|
|
... |
|
... |
520 |
oldest to newest.
|
481 |
oldest to newest.
|
521 |
"""
|
482 |
"""
|
522 |
if not all_commit_ids: return None
|
483 |
if not all_commit_ids: return None
|
523 |
if not new_commit_ids: return all_commit_ids[-1]
|
484 |
if not new_commit_ids: return all_commit_ids[-1]
|
524 |
return all_commit_ids[all_commit_ids.index(new_commit_ids[0]) - 1]
|
485 |
return all_commit_ids[all_commit_ids.index(new_commit_ids[0]) - 1]
|
|
|
486 |
|
|
|
487 |
|
|
|
488 |
def compute_lcds(commit, cache):
|
|
|
489 |
'''
|
|
|
490 |
Compute LastCommit data for every Tree node under this tree.
|
|
|
491 |
'''
|
|
|
492 |
trees = cache.get(TreesDoc, dict(_id=commit._id))
|
|
|
493 |
if not trees:
|
|
|
494 |
log.error('Missing TreesDoc for %s; skipping compute_lcd' % commit)
|
|
|
495 |
return
|
|
|
496 |
with h.push_config(c, model_cache=cache):
|
|
|
497 |
_update_tree_cache(trees.tree_ids, cache)
|
|
|
498 |
tree = _pull_tree(cache, commit.tree_id, commit)
|
|
|
499 |
_compute_lcds(tree, cache)
|
|
|
500 |
|
|
|
501 |
def _compute_lcds(tree, cache):
|
|
|
502 |
if tree.path().strip('/') not in tree.commit.changed_paths:
|
|
|
503 |
return
|
|
|
504 |
lcd = LastCommit.get(tree, create=True) # auto-vivify LCD
|
|
|
505 |
for x in tree.tree_ids:
|
|
|
506 |
sub_tree = _pull_tree(cache, x.id, tree, x.name)
|
|
|
507 |
_compute_lcds(sub_tree, cache)
|
|
|
508 |
|
|
|
509 |
def _pull_tree(cache, tree_id, *context):
|
|
|
510 |
'''
|
|
|
511 |
Since the Tree instances stick around in our cache,
|
|
|
512 |
subsequent calls to set_context are overwriting our
|
|
|
513 |
in-use copies and confusing the walk. So, make a
|
|
|
514 |
memory-only copy for our use.
|
|
|
515 |
'''
|
|
|
516 |
cache_tree = cache.get(Tree, dict(_id=tree_id))
|
|
|
517 |
new_tree = Tree(
|
|
|
518 |
_id=cache_tree._id,
|
|
|
519 |
tree_ids=cache_tree.tree_ids,
|
|
|
520 |
blob_ids=cache_tree.blob_ids,
|
|
|
521 |
other_ids=cache_tree.other_ids,
|
|
|
522 |
)
|
|
|
523 |
session(new_tree).expunge(new_tree)
|
|
|
524 |
new_tree.set_context(*context)
|
|
|
525 |
return new_tree
|
|
|
526 |
|
|
|
527 |
def _update_tree_cache(tree_ids, cache):
|
|
|
528 |
current_ids = set(tree_ids)
|
|
|
529 |
cached_ids = set(cache.instance_ids(Tree))
|
|
|
530 |
new_ids = current_ids - cached_ids
|
|
|
531 |
cache.batch_load(Tree, {'_id': {'$in': list(new_ids)}})
|