# Allura/allura/model/repo_refresh.py
1
import logging
2
from itertools import chain
3
from cPickle import dumps
4
5
import bson
6
from tg import config
7
8
from ming.base import Object
9
10
from allura.lib import utils
11
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
12
from allura.model.repo import LastCommitDoc, CommitRunDoc
13
from allura.model.repo import Commit
14
from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
15
16
# Module-level logger shared by every refresh step in this file.
log = logging.getLogger(__name__)

# Number of IDs handed to a single chunked query (see utils.chunked_iter uses).
QSIZE = 100
19
20
def refresh_repo(repo, all_commits=False, notify=True):
    '''Refresh the stored commit data for a repository

    The refresh runs these steps in order: per-commit info, the
    commit-to-repo association, child references, commit runs, the tree
    lists and finally the diffs.

    :param repo: repository object; must provide ``all_commit_ids()``,
        ``refresh_commit_info(oid, seen)`` and ``_id``
    :param all_commits: when false, only commits that are not already in the
        DB are processed; when true every commit of the repo is processed
    :param notify: when true, feed entries and a notification are created for
        the processed commits
    '''
    all_commit_ids = commit_ids = list(repo.all_commit_ids())
    if not all_commits:
        # Skip commits that are already in the DB
        commit_ids = unknown_commit_ids(commit_ids)
    log.info('Refreshing %d commits', len(commit_ids))

    # Refresh commits
    seen = set()
    for i, oid in enumerate(commit_ids):
        repo.refresh_commit_info(oid, seen)
        if (i+1) % 100 == 0:
            log.info('Refresh commit info %d: %s', (i+1), oid)

    # The repo association is refreshed for *every* commit of the repo, not
    # just the unknown ones.
    refresh_commit_repos(all_commit_ids, repo)

    # Refresh child references
    seen = set()
    parents = set()

    for i, oid in enumerate(commit_ids):
        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
        refresh_children(ci)
        seen.add(ci._id)
        parents.update(ci.parent_ids)
        if (i+1) % 100 == 0:
            log.info('Refresh child (a) info %d: %s', (i+1), ci._id)
    # Parents that were not themselves part of this refresh also get their
    # children pushed up one level; parents missing from the DB are skipped.
    # (``i`` is always bound here: ``parents`` is empty if the loop above
    # never ran.)
    for j, oid in enumerate(parents-seen):
        try:
            ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
        except StopIteration:
            continue
        refresh_children(ci)
        if (i + j + 1) % 100 == 0:
            log.info('Refresh child (b) info %d: %s', (i + j + 1), ci._id)

    # Refresh commit runs
    rb = CommitRunBuilder(commit_ids)
    rb.run()
    rb.cleanup()

    # Refresh trees
    cache = {}
    for i, oid in enumerate(commit_ids):
        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
        # Each call returns the cache trimmed to the trees of this commit
        cache = refresh_commit_trees(ci, cache)
        if (i+1) % 100 == 0:
            log.info('Refresh commit trees %d: %s', (i+1), ci._id)

    # Compute diffs
    cache = {}
    for i, oid in enumerate(commit_ids):
        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
        # compute_diffs updates ``cache`` in place
        compute_diffs(repo._id, cache, ci)
        if (i+1) % 100 == 0:
            log.info('Compute diffs %d: %s', (i+1), ci._id)

    # Send notifications
    if notify:
        # send_notifications takes the repo as its first argument
        send_notifications(repo, commit_ids)
80
81
def refresh_commit_trees(ci, cache):
    '''Store the IDs of all trees reachable from a commit's root tree

    ``cache`` maps tree IDs to the IDs of their direct subtrees and is filled
    by ``trees()`` during the walk.  The returned dict is a new cache holding
    only the entries for the trees of this commit.
    '''
    reachable = list(trees(ci.tree_id, cache))
    doc = TreesDoc(dict(_id=ci._id, tree_ids=reachable))
    doc.m.save(safe=False)
    # Drop cache entries for trees this commit does not contain
    trimmed = {}
    for tree_id in doc.tree_ids:
        trimmed[tree_id] = cache[tree_id]
    return trimmed
91
92
def refresh_commit_repos(all_commit_ids, repo):
    '''Refresh the list of repositories within which a set of commits are
    contained

    For every commit in ``all_commit_ids`` whose ``repo_ids`` does not yet
    include ``repo._id``, the repo is added to the commit and an artifact
    reference plus two shortlinks (shorthand ID and full commit ID) are
    stored.
    '''
    for oids in utils.chunked_iter(all_commit_ids, QSIZE):
        for ci in CommitDoc.m.find(dict(
                _id={'$in': list(oids)},
                repo_ids={'$ne': repo._id})):
            oid = ci._id
            ci.repo_ids.append(repo._id)
            index_id = 'allura.model.repo.Commit#' + oid
            commit_url = repo.url() + 'ci/' + oid + '/'
            ref = ArtifactReferenceDoc(dict(
                    _id=index_id,
                    artifact_reference=dict(
                        cls=dumps(Commit),
                        project_id=repo.app.config.project_id,
                        app_config_id=repo.app.config._id,
                        artifact_id=oid),
                    references=[]))
            # Link for the shorthand form; [1:-1] drops the first and last
            # character of the value returned by shorthand_for_commit.
            short_link = ShortlinkDoc(dict(
                    _id=bson.ObjectId(),
                    ref_id=index_id,
                    project_id=repo.app.config.project_id,
                    app_config_id=repo.app.config._id,
                    link=repo.shorthand_for_commit(oid)[1:-1],
                    url=commit_url))
            # Always create a link for the full commit ID
            full_link = ShortlinkDoc(dict(
                    _id=bson.ObjectId(),
                    ref_id=index_id,
                    project_id=repo.app.config.project_id,
                    app_config_id=repo.app.config._id,
                    link=oid,
                    url=commit_url))
            # Persist the updated repo_ids; without this the append above is
            # lost and the commit matches the query again on the next refresh.
            ci.m.save(safe=False, validate=False)
            ref.m.save(safe=False, validate=False)
            # Both links are saved; they are kept in separate variables so
            # the shorthand link is not discarded by the second one.
            short_link.m.save(safe=False, validate=False)
            full_link.m.save(safe=False, validate=False)
127
128
def refresh_children(ci):
    '''Record the given commit as a child of each of its parent commits'''
    parents_query = {'_id': {'$in': ci.parent_ids}}
    # $addToSet leaves child_ids untouched if the commit is already listed
    add_child = {'$addToSet': {'child_ids': ci._id}}
    CommitDoc.m.update_partial(parents_query, add_child, multi=True)
134
135
class CommitRunBuilder(object):
    '''Builds linear runs of single-parent commits and stores them as
    CommitRunDoc documents'''

    def __init__(self, commit_ids):
        self.commit_ids = commit_ids
        # commit ID -> ID of the run that currently contains the commit
        self.run_index = {}
        # run ID -> CommitRunDoc being built
        self.runs = {}
        # run ID -> last reason the run could not be merged with its parent
        self.reasons = {}

    def run(self):
        '''Create one run per commit, merge them where possible, then save

        Returns the dict of runs (keyed by run ID) that were saved.
        '''
        for chunk in utils.chunked_iter(self.commit_ids, QSIZE):
            chunk_ids = list(chunk)
            for ci in CommitDoc.m.find(dict(_id={'$in': chunk_ids})):
                if ci._id in self.run_index:
                    # already part of a run built earlier
                    continue
                # Every commit starts out as a single-commit run named
                # after the commit itself.
                self.run_index[ci._id] = ci._id
                self.runs[ci._id] = CommitRunDoc(dict(
                    _id=ci._id,
                    parent_commit_ids=ci.parent_ids,
                    commit_ids=[ci._id],
                    commit_times=[ci.authored.date]))
            self.merge_runs()
        log.info('%d runs', len(self.runs))
        for run_id in sorted(self.runs):
            log.info('%32s: %r',
                     self.reasons.get(run_id, 'none'),
                     self.runs[run_id]._id)
        for built in self.runs.values():
            built.m.save()
        return self.runs

    def _all_runs(self):
        '''Generate the stored runs that contain this builder's commit IDs,
        followed by the runs containing their parent commits, each run once'''
        found = {}
        for chunk in utils.chunked_iter(self.commit_ids, QSIZE):
            chunk_ids = list(chunk)
            for stored in CommitRunDoc.m.find(
                    dict(commit_ids={'$in': chunk_ids})):
                found[stored._id] = stored
        visited = set()
        pending = list(found.values())
        while pending:
            current = pending.pop()
            if current._id in visited:
                continue
            visited.add(current._id)
            yield current
            # Queried only after the consumer resumes the generator, so the
            # result reflects any deletions the consumer made meanwhile.
            pending.extend(CommitRunDoc.m.find(
                dict(commit_ids={'$in': current.parent_commit_ids})))

    def cleanup(self):
        '''Delete stored runs that overlap the start of another run'''
        for keeper in self._all_runs():
            head_commit = keeper.commit_ids[0]
            for other in CommitRunDoc.m.find(dict(commit_ids=head_commit)):
                if other._id != keeper._id:
                    log.info('... delete %r (part of %r)', other, keeper)
                    other.m.delete()

    def merge_runs(self):
        '''Repeatedly fold a parent run into its child run until no further
        merge is possible'''
        while True:
            absorbed_id = None
            for run_id, run in self.runs.items():
                parent_count = len(run.parent_commit_ids)
                if parent_count != 1:
                    self.reasons[run_id] = '%d parents' % parent_count
                    continue
                parent_oid = run.parent_commit_ids[0]
                parent_run_id = self.run_index.get(parent_oid)
                if parent_run_id is None:
                    self.reasons[run_id] = 'parent commit not found'
                    continue
                parent_run = self.runs.get(parent_run_id)
                if parent_run is None:
                    self.reasons[run_id] = 'parent run not found'
                    continue
                if parent_run.commit_ids[0] != parent_oid:
                    # The parent commit sits in the middle of another run,
                    # so the two runs cannot be joined end to end.
                    self.reasons[run_id] = 'parent does not start with parent commit'
                    continue
                # Append the parent run to this run and take over its parents
                run.commit_ids += parent_run.commit_ids
                run.commit_times += parent_run.commit_times
                run.parent_commit_ids = parent_run.parent_commit_ids
                for oid in parent_run.commit_ids:
                    self.run_index[oid] = run_id
                absorbed_id = parent_run_id
                break
            if absorbed_id is None:
                # A full pass found nothing to merge
                break
            # Remove the absorbed run outside the loop over self.runs
            del self.runs[absorbed_id]
219
220
def trees(id, cache):
    '''Generate a tree ID followed by the IDs of all trees nested below it

    IDs are produced depth-first, each parent before its subtrees.  ``cache``
    maps a tree ID to the list of its direct subtree IDs; entries missing
    from it are loaded from TreeDoc and added to it.
    '''
    pending = [id]
    while pending:
        tree_id = pending.pop()
        yield tree_id
        children = cache.get(tree_id)
        if children is None:
            tree = TreeDoc.m.get(_id=tree_id)
            children = [entry.id for entry in tree.tree_ids]
            cache[tree_id] = children
        # Pushed in reverse so the first child is popped (and yielded) first
        pending.extend(reversed(list(children)))
231
232
def unknown_commit_ids(all_commit_ids):
    '''Filter out all commit ids that have already been cached

    Returns the IDs from ``all_commit_ids`` for which no CommitDoc exists,
    in their original order.
    '''
    result = []
    for chunk in utils.chunked_iter(all_commit_ids, QSIZE):
        # Materialise the chunk: it is used both in the query and in the
        # filter below (same pattern as the other chunked loops in this file).
        chunk = list(chunk)
        # The query is passed as a spec dict, like every other find() here
        q = CommitDoc.m.find(dict(_id={'$in': chunk}))
        known_commit_ids = set(ci._id for ci in q)
        result += [oid for oid in chunk if oid not in known_commit_ids]
    return result
240
241
def compute_diffs(repo_id, tree_cache, rhs_ci):
    '''Compute simple differences between a commit and its first parent

    The differences are saved as a DiffInfoDoc keyed by the commit ID, and
    last-commit info is recorded for every changed or added object.
    ``tree_cache`` (tree ID -> tree document) is updated in place so that it
    ends up holding exactly the trees of ``rhs_ci``; it is also returned.
    '''
    def _descendant_ids(tree, index):
        # IDs of a tree's blobs, then its other entries, then each subtree
        # followed by that subtree's own contents.
        for entry in chain(tree.blob_ids, tree.other_ids):
            yield entry.id
        for subtree in tree.tree_ids:
            yield subtree.id
            for nested_id in _descendant_ids(index[subtree.id], index):
                yield nested_id

    rhs_tree_ids = TreesDoc.m.get(_id=rhs_ci._id).tree_ids
    # A commit without parents is diffed against an empty tree
    lhs_ci = None
    if rhs_ci.parent_ids:
        lhs_ci = CommitDoc.m.get(_id=rhs_ci.parent_ids[0])
    if lhs_ci is None:
        lhs_tree_ids = []
    else:
        lhs_tree_ids = TreesDoc.m.get(_id=lhs_ci._id).tree_ids

    # Load only the trees that are not cached yet
    uncached_ids = [
        tree_id for tree_id in chain(lhs_tree_ids, rhs_tree_ids)
        if tree_id not in tree_cache]
    tree_index = {}
    for tree in TreeDoc.m.find(
            dict(_id={'$in': uncached_ids}), validate=False):
        tree_index[tree._id] = tree
    tree_index.update(tree_cache)

    # Reset the cache to the trees of the right-hand commit
    rhs_id_set = set(rhs_tree_ids)
    tree_cache.clear()
    for tree_id, tree in tree_index.items():
        if tree_id in rhs_id_set:
            tree_cache[tree_id] = tree

    rhs_root = tree_index[rhs_ci.tree_id]
    if lhs_ci is None:
        lhs_root = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
    else:
        lhs_root = tree_index[lhs_ci.tree_id]

    differences = []
    for name, lhs_id, rhs_id in _diff_trees(lhs_root, rhs_root, tree_index):
        differences.append(dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
        # Set last commit info
        if rhs_id is not None:
            _set_last_commit(repo_id, rhs_id, rhs_ci)
        # If the right-hand object is a known tree, everything below it is
        # attributed to this commit as well.
        changed_tree = tree_index.get(rhs_id)
        if changed_tree is not None:
            for oid in _descendant_ids(changed_tree, tree_index):
                _set_last_commit(repo_id, oid, rhs_ci)

    diff_doc = DiffInfoDoc(dict(_id=rhs_ci._id, differences=differences))
    diff_doc.m.save()
    return tree_cache
291
292
def send_notifications(repo, commit_ids):
    '''Create appropriate notification and feed objects for a refresh

    One feed entry is posted per commit in ``commit_ids``.  Afterwards a
    single notification is posted against ``repo``: for one commit it carries
    that commit's message, for several it lists one summary line per commit.
    Nothing is posted when ``commit_ids`` is empty.
    '''
    # Imported here rather than at module level -- presumably to avoid a
    # circular import; TODO confirm.
    from allura.model import Feed, Notification
    commit_msgs = []
    for oids in utils.chunked_iter(commit_ids, QSIZE):
        chunk = list(oids)
        index = dict(
            (doc._id, doc)
            for doc in CommitDoc.m.find(dict(_id={'$in': chunk})))
        # Iterate the chunk (not the query result) to keep the given order
        for oid in chunk:
            ci = index[oid]
            href = '%s%sci/%s/' % (
                config.common_prefix,
                repo.url(),
                oid)
            summary = _summarize(ci.message)
            # href already ends with '/', so the link template must not add
            # another one (that produced URLs ending in '//').
            item = Feed.post(
                repo, title='New commit',
                description='%s<br><a href="%s">View Changes</a>' % (
                    summary, href))
            item.author_link = ci.author_url
            item.author_name = ci.authored.name
            commit_msgs.append('%s by %s <%s>' % (
                    summary, ci.authored.name, href))
    if commit_msgs:
        if len(commit_msgs) > 1:
            subject = '%d new commits to %s %s' % (
                len(commit_msgs), repo.app.project.name, repo.app.config.options.mount_label)
            text = '\n\n'.join(commit_msgs)
        else:
            # Exactly one commit was processed, so ``ci`` and ``summary``
            # still refer to it.
            subject = '%s committed to %s %s: %s' % (
                ci.authored.name,
                repo.app.project.name,
                repo.app.config.options.mount_label,
                summary)
            text = ci.message
        Notification.post(
            artifact=repo,
            topic='metadata',
            subject=subject,
            text=text)
333
334
def _summarize(message):
335
    if not message: return ''
336
    summary = []
337
    for line in message.splitlines():
338
        line = line.rstrip()
339
        if line: summary.append(line)
340
        else: break
341
    return ' '.join(summary)
342
343
def _diff_trees(lhs, rhs, index, *path):
344
    def _fq(name):
345
        return '/'.join(reversed(
346
                (name,) + path))
347
    # Diff the trees
348
    rhs_tree_ids = dict(
349
        (o.name, o.id)
350
        for o in rhs.tree_ids)
351
    for o in lhs.tree_ids:
352
        rhs_id = rhs_tree_ids.pop(o.name, None)
353
        if rhs_id == o.id:
354
            continue # no change
355
        elif rhs_id is None:
356
            yield (_fq(o.name), o.id, None)
357
        else:
358
            for difference in _diff_trees(
359
                index[o.id], index[rhs_id], index,
360
                o.name, *path):
361
                yield difference
362
    for name, id in rhs_tree_ids.items():
363
        yield (_fq(name), None, id)
364
    # DIff the blobs
365
    rhs_blob_ids = dict(
366
        (o.name, o.id)
367
        for o in rhs.blob_ids)
368
    for o in lhs.blob_ids:
369
        rhs_id = rhs_blob_ids.pop(o.name, None)
370
        if rhs_id == o.id:
371
            continue # no change
372
        elif rhs_id is None:
373
            yield (_fq(o.name), o.id, None)
374
        else:
375
            yield (_fq(o.name), o.id, rhs_id)
376
    for name, id in rhs_blob_ids.items():
377
        yield (_fq(name), None, id)
378
379
def _set_last_commit(repo_id, oid, commit):
    '''Save and return a LastCommitDoc naming ``commit`` as the last commit
    that touched object ``oid`` in repository ``repo_id``'''
    authored = commit.authored
    # Only id, author, email and date are recorded; author_url, href,
    # shortlink and summary are not stored.
    commit_info = dict(
        id=commit._id,
        author=authored.name,
        author_email=authored.email,
        date=authored.date)
    lc = LastCommitDoc(dict(
        _id='%s:%s' % (repo_id, oid),
        repo_id=repo_id,
        object_id=oid,
        commit_info=commit_info))
    lc.m.save(safe=False)
    return lc