a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
1
import logging
1
import logging
2
from itertools import chain
2
from itertools import chain
3
from cPickle import dumps
3
from cPickle import dumps
4
import re
4
import re
5
import os
5
6
6
import bson
7
import bson
7
8
8
import tg
9
import tg
9
10
10
from pylons import g
11
from pylons import g,c
11
12
12
from ming.base import Object
13
from ming.base import Object
13
from ming.orm import mapper, session
14
from ming.orm import mapper, session, ThreadLocalORMSession
14
15
15
from allura.lib import utils
16
from allura.lib import utils
16
from allura.lib import helpers as h
17
from allura.lib import helpers as h
17
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
18
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
18
from allura.model.repo import LastCommitDoc, CommitRunDoc
19
from allura.model.repo import LastCommitDoc, CommitRunDoc
19
from allura.model.repo import Commit
20
from allura.model.repo import Commit, Tree, LastCommit, ModelCache
20
from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
21
from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
21
22
22
log = logging.getLogger(__name__)
23
log = logging.getLogger(__name__)
23
24
24
QSIZE=100
25
QSIZE=100
...
...
55
        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
56
        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
56
        refresh_children(ci)
57
        refresh_children(ci)
57
        if (i+1) % 100 == 0:
58
        if (i+1) % 100 == 0:
58
            log.info('Refresh child info %d for parents of %s', (i+1), ci._id)
59
            log.info('Refresh child info %d for parents of %s', (i+1), ci._id)
59
60
60
    if repo.tool.lower() != 'svn':
61
    if repo._refresh_precompute:
61
        # Refresh commit runs
62
        # Refresh commit runs
62
        commit_run_ids = commit_ids
63
        commit_run_ids = commit_ids
63
        # Check if the CommitRuns for the repo are in a good state by checking for
64
        # Check if the CommitRuns for the repo are in a good state by checking for
64
        # a CommitRunDoc that contains the last known commit. If there isn't one,
65
        # a CommitRunDoc that contains the last known commit. If there isn't one,
65
        # the CommitRuns for this repo are in a bad state - rebuild them entirely.
66
        # the CommitRuns for this repo are in a bad state - rebuild them entirely.
...
...
86
            if (i+1) % 100 == 0:
87
            if (i+1) % 100 == 0:
87
                log.info('Refresh commit trees %d: %s', (i+1), ci._id)
88
                log.info('Refresh commit trees %d: %s', (i+1), ci._id)
88
89
89
    # Compute diffs
90
    # Compute diffs
90
    cache = {}
91
    cache = {}
91
    # Have to compute_diffs() for all commits to ensure that LastCommitDocs
92
    # are set properly for forked repos. For some SCMs, compute_diffs()
93
    # we don't want to pre-compute the diffs because that would be too
92
    # For some SCMs, we don't want to pre-compute the diffs because that
94
    # expensive, so we skip them here and do them on-demand with caching.
93
    # would be too expensive, so we skip them here and do them on-demand
94
    # with caching.
95
    if repo._refresh_precompute:
95
    if repo._refresh_precompute:
96
        for i, oid in enumerate(reversed(all_commit_ids)):
96
        for i, oid in enumerate(commit_ids):
97
            ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
97
            ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
98
            compute_diffs(repo._id, cache, ci)
98
            compute_diffs(repo._id, cache, ci)
99
            if (i+1) % 100 == 0:
99
            if (i+1) % 100 == 0:
100
                log.info('Compute diffs %d: %s', (i+1), ci._id)
100
                log.info('Compute diffs %d: %s', (i+1), ci._id)
101
102
    if repo._refresh_precompute:
103
        cache = ModelCache()
104
        for i, oid in enumerate(reversed(commit_ids)):
105
            ci = cache.get(Commit, dict(_id=oid))
106
            ci.set_context(repo)
107
            compute_lcds(ci, cache)
108
            ThreadLocalORMSession.flush_all()
109
            if (i+1) % 100 == 0:
110
                log.info('Compute last commit info %d: %s', (i+1), ci._id)
111
101
112
102
    log.info('Refresh complete for %s', repo.full_fs_path)
113
    log.info('Refresh complete for %s', repo.full_fs_path)
103
    g.post_event(
114
    g.post_event(
104
            'repo_refreshed',
115
            'repo_refreshed',
105
            commit_number=len(commit_ids),
116
            commit_number=len(commit_ids),
...
...
295
    return result
306
    return result
296
307
297
def compute_diffs(repo_id, tree_cache, rhs_ci):
308
def compute_diffs(repo_id, tree_cache, rhs_ci):
298
    '''compute simple differences between a commit and its first parent'''
309
    '''compute simple differences between a commit and its first parent'''
299
    if rhs_ci.tree_id is None: return tree_cache
310
    if rhs_ci.tree_id is None: return tree_cache
300
    def _walk_tree(tree, tree_index):
301
        for x in tree.blob_ids: yield x.id
302
        for x in tree.other_ids: yield x.id
303
        for x in tree.tree_ids:
304
            yield x.id
305
            for xx in _walk_tree(tree_index[x.id], tree_index):
306
                yield xx
307
311
312
    def _update_cache(lhs_tree_ids, rhs_tree_ids):
313
        # crazy cache logic that I'm not certain I understand
314
        new_tree_ids = [
315
            tid for tid in chain(lhs_tree_ids, rhs_tree_ids)
316
            if tid not in tree_cache ]
317
        tree_index = dict(
318
            (t._id, t) for t in TreeDoc.m.find(dict(_id={'$in': new_tree_ids}),validate=False))
319
        tree_index.update(tree_cache)
320
        rhs_tree_ids_set = set(rhs_tree_ids)
321
        tree_cache.clear()
322
        tree_cache.update(
323
            (id, t) for id,t in tree_index.iteritems() if id in rhs_tree_ids_set)
324
        return tree_index
325
326
    empty_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
327
    commit_info = get_commit_info(rhs_ci)
328
    differences = []
308
    treedoc = TreesDoc.m.get(_id=rhs_ci._id)
329
    rhs_treesdoc = TreesDoc.m.get(_id=rhs_ci._id)
309
310
    # FIXME: There are cases of missing TreesDoc records in production
311
    #        that should be fixed, but this is a quick-and-dirty patch
312
    #        to at least staunch the bleeding.  A "generate-if-missing"
313
    #        fix, and/or, even better, a cleanup / regen sweep plus
314
    #        audit to ensure there're no more bugs causing them to be
315
    #        missed should be done.
316
    if not treedoc:
330
    if not rhs_treesdoc:
331
        # FIXME: These sometimes don't exist for unknown reasons; they should be auto-gen'ed
332
        log.error('Missing TreesDoc: %s', rhs_ci)
317
        return tree_cache
333
        return tree_cache
318
319
    rhs_tree_ids = treedoc.tree_ids
320
321
    if rhs_ci.parent_ids:
334
    for lhs_cid in rhs_ci.parent_ids:
322
        lhs_ci = CommitDoc.m.get(_id=rhs_ci.parent_ids[0])
335
        lhs_ci = CommitDoc.m.get(_id=lhs_cid)
323
    else:
324
        lhs_ci = None
325
    if lhs_ci is not None:
326
        lhs_tree_ids = TreesDoc.m.get(_id=lhs_ci._id).tree_ids
327
    else:
328
        lhs_tree_ids = []
329
    new_tree_ids = [
330
        tid for tid in chain(lhs_tree_ids, rhs_tree_ids)
331
        if tid not in tree_cache ]
332
    tree_index = dict(
333
        (t._id, t) for t in TreeDoc.m.find(dict(_id={'$in': new_tree_ids}),validate=False))
334
    tree_index.update(tree_cache)
335
    rhs_tree_ids_set = set(rhs_tree_ids)
336
    tree_cache.clear()
337
    tree_cache.update(
338
        (id, t) for id,t in tree_index.iteritems() if id in rhs_tree_ids_set)
339
    rhs_tree = tree_index[rhs_ci.tree_id]
340
    if lhs_ci is None:
336
        if lhs_ci is None:
341
        lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
337
            log.error('Commit ID referenced as parent but not found: %s parent of %s', lhs_cid, rhs_ci)
342
    else:
338
            continue
339
        lhs_treesdoc = TreesDoc.m.get(_id=lhs_cid)
340
        if not lhs_treesdoc:
341
            # FIXME: These sometimes don't exist for unknown reasons; they should be auto-gen'ed
342
            log.error('Missing TreesDoc: %s', rhs_ci)
343
            continue
344
        tree_index = _update_cache(lhs_treesdoc.tree_ids, rhs_treesdoc.tree_ids)
343
        lhs_tree = tree_index[lhs_ci.tree_id]
345
        rhs_tree = tree_index[rhs_ci.tree_id]
344
    differences = []
346
        lhs_tree = tree_index.get(lhs_ci.tree_id, empty_tree)
345
    commit_info = get_commit_info(rhs_ci)
346
    for name, lhs_id, rhs_id in _diff_trees(lhs_tree, rhs_tree, tree_index):
347
        for name, lhs_id, rhs_id in _diff_trees(lhs_tree, rhs_tree, tree_index):
347
        differences.append(
348
            differences.append(
348
            dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
349
                dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
349
    # Set last commit data
350
    if not rhs_ci.parent_ids:
351
        # no parents, so everything in rhs is new
352
        tree_index = _update_cache([], rhs_treesdoc.tree_ids)
350
    rhs_tree = tree_index[rhs_ci.tree_id]
353
        rhs_tree = tree_index[rhs_ci.tree_id]
351
    refresh_last_commit(repo_id, '/', rhs_tree, lhs_tree, None, commit_info)
354
        for name, lhs_id, rhs_id in _diff_trees(empty_tree, rhs_tree, tree_index):
355
            differences.append(
356
                dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
352
    # Build the diffinfo
357
    # Build the diffinfo
353
    di = DiffInfoDoc(dict(
358
    di = DiffInfoDoc(dict(
354
            _id=rhs_ci._id,
359
            _id=rhs_ci._id,
355
            differences=differences))
360
            differences=differences))
356
    di.m.save()
361
    di.m.save()
...
...
418
    # Diff the trees
423
    # Diff the trees
419
    rhs_tree_ids = dict(
424
    rhs_tree_ids = dict(
420
        (o.name, o.id)
425
        (o.name, o.id)
421
        for o in rhs.tree_ids)
426
        for o in rhs.tree_ids)
422
    for o in lhs.tree_ids:
427
    for o in lhs.tree_ids:
423
        rhs_id = rhs_tree_ids.pop(o.name, None)
428
        rhs_id = rhs_tree_ids.pop(o.name, None)  # remove so won't be picked up as added, below
424
        if rhs_id == o.id:
429
        if rhs_id == o.id:  # no change
425
            continue # no change
430
            continue
426
        elif rhs_id is None:
431
        elif rhs_id is None:  # removed
427
            yield (_fq(o.name), o.id, None)
432
            yield (_fq(o.name), o.id, None)
428
        else:
433
            rhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
429
            for difference in _diff_trees(
434
        else:  # changed
430
                index[o.id], index[rhs_id], index,
435
            rhs_tree = index[rhs_id]
431
                o.name, *path):
436
        for difference in _diff_trees(index[o.id], rhs_tree, index, o.name, *path):
432
                yield difference
437
            yield difference
433
    for name, id in rhs_tree_ids.items():
438
    for name, id in rhs_tree_ids.items():  # added
434
        yield (_fq(name), None, id)
439
        yield (_fq(name), None, id)
440
        lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
441
        for difference in _diff_trees(lhs_tree, index[id], index, name, *path):
442
            yield difference
435
    # DIff the blobs
443
    # DIff the blobs
436
    rhs_blob_ids = dict(
444
    rhs_blob_ids = dict(
437
        (o.name, o.id)
445
        (o.name, o.id)
438
        for o in rhs.blob_ids)
446
        for o in rhs.blob_ids)
439
    for o in lhs.blob_ids:
447
    for o in lhs.blob_ids:
...
...
460
        author_url=commit.author_url,
468
        author_url=commit.author_url,
461
        shortlink=commit.shorthand_id(),
469
        shortlink=commit.shorthand_id(),
462
        summary=commit.summary
470
        summary=commit.summary
463
        )
471
        )
464
472
465
def refresh_last_commit(repo_id, path, tree, lhs_tree, parent_tree, commit_info):
    '''Build the LastCommit info for everything under ``tree`` that changed.

    We only need to create LastCommit info for objects that are in the
    RHS but not in the LHS, because those are the only objects that have
    had anything changed in them.  (If file x/y/z.txt changes, then its
    hash will change, which also forces the hash for tree x/y to change,
    as well as the hash for tree x.  So as long as an object's hash isn't
    in the LHS, it means it's new or modified in this commit.)

    In order to uniquely identify the tree or blob that a LastCommitDoc is
    for, the tree or blob hash is not sufficient; we also need to know
    either its full path name, or its parent tree and name.  Because of
    this, we have to walk down the commit tree.

    Params:
        repo_id: id of the repository, forwarded to set_last_commit
        path: path of ``tree``; child names are appended followed by '/'
        tree: the (RHS) tree being walked
        lhs_tree: the tree at the same path in the parent commit, or None
        parent_tree: the tree containing ``tree``; not read here, only
                     passed along on recursion
        commit_info: commit summary stored on each LastCommitDoc
    '''
    if lhs_tree is not None and tree._id == lhs_tree._id:
        # tree was not changed in this commit (nor was anything under it)
        return

    # map LHS entries (trees, blobs and others alike) by name for easy lookup
    lhs_map = {}
    if lhs_tree:
        for lhs_child in chain(lhs_tree.tree_ids, lhs_tree.blob_ids, lhs_tree.other_ids):
            lhs_map[lhs_child.name] = lhs_child.id

    # update our children
    for child in chain(tree.tree_ids, tree.blob_ids, tree.other_ids):
        if child.id != lhs_map.get(child.name):  # new or changed in this commit
            set_last_commit(repo_id, path, child.name, child.id, commit_info)

    # (re)curse at our child trees
    for child_ref in tree.tree_ids:
        child_name = child_ref.name
        child_tree = TreeDoc.m.get(_id=child_ref.id)
        lhs_child = None
        if child_name in lhs_map:
            # the LHS entry of that name may not be a tree, in which case
            # the lookup presumably yields None -- TODO confirm
            lhs_child = TreeDoc.m.get(_id=lhs_map[child_name])
        refresh_last_commit(repo_id, path + child_name + '/', child_tree, lhs_child, tree, commit_info)
502
503
def set_last_commit(repo_id, path, name, oid, commit_info):
    '''Save and return the LastCommitDoc for the object ``name`` under ``path``.

    The document id is "<repo_id>:<path>:<name>"; the document records the
    object id, its name and the given commit info.  It is saved with
    safe=False, upsert=True.
    '''
    doc_id = '%s:%s:%s' % (repo_id, path, name)
    fields = dict(
        _id=doc_id,
        object_id=oid,
        name=name,
        commit_info=commit_info)
    last_commit = LastCommitDoc(fields)
    last_commit.m.save(safe=False, upsert=True)
    return last_commit
511
512
def last_known_commit_id(all_commit_ids, new_commit_ids):
473
def last_known_commit_id(all_commit_ids, new_commit_ids):
513
    """
474
    """
514
    Return the newest "known" (cached in mongo) commit id.
475
    Return the newest "known" (cached in mongo) commit id.
515
476
516
    Params:
477
    Params:
...
...
520
                        oldest to newest.
481
                        oldest to newest.
521
    """
482
    """
522
    if not all_commit_ids: return None
483
    if not all_commit_ids: return None
523
    if not new_commit_ids: return all_commit_ids[-1]
484
    if not new_commit_ids: return all_commit_ids[-1]
524
    return all_commit_ids[all_commit_ids.index(new_commit_ids[0]) - 1]
485
    return all_commit_ids[all_commit_ids.index(new_commit_ids[0]) - 1]
486
487
488
def compute_lcds(commit, cache):
    '''
    Compute LastCommit data for the Tree nodes under this commit.

    The commit's TreesDoc is looked up through ``cache``; if there is none,
    an error is logged and nothing is computed.  Otherwise, with ``cache``
    pushed onto the context ``c`` as ``model_cache``, the commit's trees are
    loaded into the cache and the walk starts from the commit's root tree.
    '''
    trees = cache.get(TreesDoc, dict(_id=commit._id))
    if not trees:
        # pass the commit as a logging argument (as the rest of this module
        # does) so the message is only formatted when it is emitted
        log.error('Missing TreesDoc for %s; skipping compute_lcd', commit)
        return
    with h.push_config(c, model_cache=cache):
        _update_tree_cache(trees.tree_ids, cache)
        tree = _pull_tree(cache, commit.tree_id, commit)
        _compute_lcds(tree, cache)
500
501
def _compute_lcds(tree, cache):
    '''
    Recursively walk ``tree``, requesting a LastCommit for each tree visited.

    A tree whose path is not in its commit's ``changed_paths`` is skipped,
    together with everything below it.
    '''
    if tree.path().strip('/') not in tree.commit.changed_paths:
        return
    # called only for its side effect (auto-vivify LCD); the result is not needed
    LastCommit.get(tree, create=True)
    for entry in tree.tree_ids:
        sub_tree = _pull_tree(cache, entry.id, tree, entry.name)
        _compute_lcds(sub_tree, cache)
508
509
def _pull_tree(cache, tree_id, *context):
    '''
    Return a memory-only copy of the cached Tree with id ``tree_id``.

    The Tree instances stay in the cache, so later set_context calls
    would overwrite copies still in use and confuse the walk.  To avoid
    that, a fresh Tree is built from the cached one's fields, expunged
    from its session, and given ``context`` via set_context.
    '''
    cached = cache.get(Tree, dict(_id=tree_id))
    copy_kwargs = dict(
        _id=cached._id,
        tree_ids=cached.tree_ids,
        blob_ids=cached.blob_ids,
        other_ids=cached.other_ids)
    detached = Tree(**copy_kwargs)
    # drop the copy from its session so it stays memory-only
    session(detached).expunge(detached)
    detached.set_context(*context)
    return detached
526
527
def _update_tree_cache(tree_ids, cache):
    '''
    Batch-load into ``cache`` the Trees from ``tree_ids`` it does not hold yet.
    '''
    wanted = set(tree_ids)
    already_cached = set(cache.instance_ids(Tree))
    missing = list(wanted.difference(already_cached))
    query = {'_id': {'$in': missing}}
    cache.batch_load(Tree, query)