Switch to unified view

a/Allura/test-light.py b/Allura/test-light.py
1
import sys
1
import sys
2
import logging
3
from collections import defaultdict
4
from itertools import chain, izip
5
from datetime import datetime
6
from cPickle import dumps
7
2
8
import bson
9
from pylons import c
3
from pylons import c
10
from pymongo.errors import DuplicateKeyError
11
12
from ming.base import Object
13
4
14
from allura.lib import helpers as h
5
from allura.lib import helpers as h
15
from allura.lib import utils
16
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
6
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
17
from allura.model.repo import LastCommitDoc, CommitRunDoc
7
from allura.model.repo import LastCommitDoc, CommitRunDoc
18
from allura.model.repo import Commit
8
from allura.model.repo_refresh import refresh_repo
19
from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
20
21
log = logging.getLogger(__name__)
22
23
# Chunk size passed to utils.chunked_iter() wherever this module batches
# commit ids before building '$in' queries.
QSIZE=100
24
9
25
def main():
    """Wipe and rebuild the commit collections for the 'code' app of 'test'.

    When any command-line argument is given, a Git app named 'code' is
    first installed into the 'test' project from a hard-coded local path.
    All commit-related collections are then emptied and repopulated in
    stages: commit info, artifact references/shortlinks, child references,
    commit runs, per-commit tree lists and finally diffs.
    """
    if len(sys.argv) > 1:
        h.set_context('test')
        c.project.install_app('Git', 'code', 'Code', init_from_url='/home/rick446/src/forge')
    h.set_context('test', 'code')
    CommitDoc.m.remove({})
    TreeDoc.m.remove({})
    TreesDoc.m.remove({})
    DiffInfoDoc.m.remove({})
    LastCommitDoc.m.remove({})
    CommitRunDoc.m.remove({})

    # Get all commits (repo-specific)
    all_commit_ids = list(c.app.repo.all_commit_ids())

    # Skip commits that are already in the DB (repo-agnostic)
    commit_ids = unknown_commit_ids(all_commit_ids)
    # commit_ids = commit_ids[:500]
    log.info('Refreshing %d commits', len(commit_ids))

    # Refresh commits (repo-specific)
    seen = set()
    for i, oid in enumerate(commit_ids):
        c.app.repo.refresh_commit_info(oid, seen)
        if (i + 1) % 100 == 0:
            log.info('Refresh commit info %d: %s', (i + 1), oid)

    #############################################
    # Everything below here is repo-agnostic
    #############################################

    refresh_repo(commit_ids, c.app.repo)

    # Refresh child references
    seen = set()
    parents = set()

    for i, oid in enumerate(commit_ids):
        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
        refresh_children(ci)
        seen.add(ci._id)
        parents.update(ci.parent_ids)
        if (i + 1) % 100 == 0:
            log.info('Refresh child (a) info %d: %s', (i + 1), ci._id)
    # Parents that were not refreshed above still need their own parents'
    # child_ids updated.  The progress counter continues from the number of
    # commits already handled instead of reusing the previous loop's index.
    refreshed = len(commit_ids)
    for j, oid in enumerate(parents - seen):
        try:
            ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
        except StopIteration:
            # Parent commit is not in the database; nothing to update.
            continue
        refresh_children(ci)
        if (refreshed + j + 1) % 100 == 0:
            log.info('Refresh child (b) info %d: %s', (refreshed + j + 1), ci._id)

    # Refresh commit runs
    rb = CommitRunBuilder(commit_ids)
    rb.run()
    rb.cleanup()

    # Refresh trees
    cache = {}
    for i, oid in enumerate(commit_ids):
        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
        cache = refresh_commit_trees(ci, cache)
        if (i + 1) % 100 == 0:
            log.info('Refresh commit trees %d: %s', (i + 1), ci._id)

    # Compute diffs
    cache = {}
    for i, oid in enumerate(commit_ids):
        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
        compute_diffs(c.app.repo._id, cache, ci)
        if (i + 1) % 100 == 0:
            log.info('Compute diffs %d: %s', (i + 1), ci._id)
98
99
def refresh_commit_trees(ci, cache):
    """Save a ``TreesDoc`` listing every tree id reachable from ``ci``.

    ``cache`` maps a tree id to the list of its child tree ids; it is
    consulted and filled in by ``trees()`` during the walk.

    Returns a new dict containing only the cache entries for the trees of
    this commit, for the caller to pass in on the next call.
    """
    commit_id = ci._id
    reachable = list(trees(ci.tree_id, cache))
    trees_doc = TreesDoc(dict(_id=commit_id, tree_ids=reachable))
    trees_doc.m.save(safe=False)
    pruned = {}
    for tree_id in trees_doc.tree_ids:
        pruned[tree_id] = cache[tree_id]
    return pruned
108
109
def refresh_commit_info(ci, seen):
    """Insert a ``CommitDoc`` for ``ci`` unless one is already stored.

    ``ci`` is a commit object exposing ``hexsha``, ``tree``, ``parents``,
    ``message`` and committer/author details (presumably a GitPython
    commit -- confirm against the caller).  ``seen`` is the set handed on
    to ``refresh_tree()`` so already-stored trees are skipped.

    Returns False when the commit was already present (either found up
    front or detected through a duplicate-key error on insert), otherwise
    True after the commit's tree has been refreshed as well.
    """
    def _signature(person, timestamp, tz_offset):
        # Name/email/date triple stored for both committer and author.
        return Object(
            name=h.really_unicode(person.name),
            email=h.really_unicode(person.email),
            date=datetime.utcfromtimestamp(timestamp - tz_offset))

    already_stored = CommitDoc.m.find(dict(_id=ci.hexsha)).count() != 0
    if already_stored:
        return False
    try:
        fields = dict(
            _id=ci.hexsha,
            tree_id=ci.tree.hexsha,
            committed=_signature(
                ci.committer, ci.committed_date, ci.committer_tz_offset),
            authored=_signature(
                ci.author, ci.authored_date, ci.author_tz_offset),
            message=h.really_unicode(ci.message or ''),
            child_ids=[],
            parent_ids=[parent.hexsha for parent in ci.parents])
        ci_doc = CommitDoc(fields)
        ci_doc.m.insert(safe=True)
    except DuplicateKeyError:
        # Another writer stored the same commit between the check and the
        # insert; treat it like the already-present case.
        return False
    refresh_tree(ci.tree, seen)
    return True
134
135
def refresh_repo(commit_ids, repo):
    """Create index documents for ``commit_ids`` and link them to ``repo``.

    Commits are handled in chunks of ``QSIZE``.  For every commit an
    ``ArtifactReferenceDoc`` and a ``ShortlinkDoc`` are saved; afterwards
    ``repo._id`` is added to ``repo_ids`` on each ``CommitDoc`` of the
    chunk that does not list it yet.
    """
    for chunk in utils.chunked_iter(commit_ids, QSIZE):
        oids = list(chunk)
        # Create shortlinks and artifactrefs
        for oid in oids:
            index_id = 'allura.model.repo.Commit#' + oid
            artifact_reference = dict(
                cls=dumps(Commit),
                project_id=repo.app.config.project_id,
                app_config_id=repo.app.config._id,
                artifact_id=oid)
            ref = ArtifactReferenceDoc(dict(
                _id=index_id,
                artifact_reference=artifact_reference,
                references=[]))
            shortlink_fields = dict(
                _id=bson.ObjectId(),
                ref_id=index_id,
                project_id=repo.app.config.project_id,
                app_config_id=repo.app.config._id,
                link=repo.shorthand_for_commit(oid),
                url=repo.url() + 'ci/' + oid + '/')
            link = ShortlinkDoc(shortlink_fields)
            ref.m.save(safe=False, validate=False)
            link.m.save(safe=False, validate=False)
        not_yet_linked = dict(
            _id={'$in': oids},
            repo_ids={'$ne': repo._id})
        add_repo = {'$addToSet': dict(repo_ids=repo._id)}
        CommitDoc.m.update_partial(not_yet_linked, add_repo, multi=True)
164
165
def refresh_children(ci):
    """Add ``ci._id`` to ``child_ids`` on every stored parent of ``ci``."""
    parents_query = dict(_id={'$in': ci.parent_ids})
    add_child = {'$addToSet': dict(child_ids=ci._id)}
    CommitDoc.m.update_partial(parents_query, add_child, multi=True)
170
171
class CommitRunBuilder(object):
    """Build ``CommitRunDoc`` documents for a list of commit ids.

    Every commit starts out as its own single-commit run.  A run then
    absorbs its parent's run whenever it has exactly one parent commit and
    that parent is the first commit of another known run; the reason a run
    stopped growing is kept in ``reasons`` for logging.
    """

    def __init__(self, commit_ids):
        self.commit_ids = commit_ids
        self.run_index = {}  # commit id -> id of the run containing it
        self.runs = {}       # run id -> run document
        self.reasons = {}    # run id -> reason the run stopped merging

    def run(self):
        """Create, merge and save runs for ``self.commit_ids``.

        Returns the dict of surviving runs keyed by run id.
        """
        for oids in utils.chunked_iter(self.commit_ids, QSIZE):
            oids = list(oids)
            commits = list(CommitDoc.m.find(dict(_id={'$in': oids})))
            for ci in commits:
                if ci._id in self.run_index:
                    continue
                self.run_index[ci._id] = ci._id
                self.runs[ci._id] = CommitRunDoc(dict(
                    _id=ci._id,
                    parent_commit_ids=ci.parent_ids,
                    commit_ids=[ci._id],
                    commit_times=[ci.authored.date]))
            self.merge_runs()
        log.info('%d runs', len(self.runs))
        for rid, run in sorted(self.runs.items()):
            log.info('%32s: %r', self.reasons.get(rid, 'none'), run._id)
        for run in self.runs.values():
            run.m.save()
        return self.runs

    def _all_runs(self):
        """Yield each stored run touching our commits, then its ancestors.

        Starts from the runs containing any of ``self.commit_ids`` and
        keeps following ``parent_commit_ids`` to the runs containing those
        commits; every run is yielded at most once.
        """
        found = {}
        for oids in utils.chunked_iter(self.commit_ids, QSIZE):
            oids = list(oids)
            for run in CommitRunDoc.m.find(dict(commit_ids={'$in': oids})):
                found[run._id] = run
        seen_run_ids = set()
        # A real list is required here: the loop pops from and appends to it.
        pending = list(found.values())
        while pending:
            run = pending.pop()
            if run._id in seen_run_ids:
                continue
            seen_run_ids.add(run._id)
            yield run
            for parent_run in CommitRunDoc.m.find(
                    dict(commit_ids={'$in': run.parent_commit_ids})):
                pending.append(parent_run)

    def cleanup(self):
        '''Delete non-maximal runs'''
        for run1 in self._all_runs():
            # Any other run containing run1's first commit is treated as a
            # fragment of run1 and removed.
            for run2 in CommitRunDoc.m.find(dict(
                    commit_ids=run1.commit_ids[0])):
                if run1._id == run2._id:
                    continue
                log.info('... delete %r (part of %r)', run2, run1)
                run2.m.delete()

    def merge_runs(self):
        """Repeatedly merge runs into their children until nothing changes.

        Each pass looks for the first run that can absorb its parent's run,
        performs that single merge, drops the absorbed run and starts over
        (``self.runs`` is only modified after the scan has been abandoned).
        """
        while True:
            for run_id, run in self.runs.items():
                if len(run.parent_commit_ids) != 1:
                    self.reasons[run_id] = '%d parents' % len(run.parent_commit_ids)
                    continue
                p_oid = run.parent_commit_ids[0]
                p_run_id = self.run_index.get(p_oid)
                if p_run_id is None:
                    self.reasons[run_id] = 'parent commit not found'
                    continue
                p_run = self.runs.get(p_run_id)
                if p_run is None:
                    self.reasons[run_id] = 'parent run not found'
                    continue
                if p_run.commit_ids[0] != p_oid:
                    self.reasons[run_id] = 'parent does not start with parent commit'
                    continue
                run.commit_ids += p_run.commit_ids
                run.commit_times += p_run.commit_times
                run.parent_commit_ids = p_run.parent_commit_ids
                for oid in p_run.commit_ids:
                    self.run_index[oid] = run_id
                break
            else:
                # Full scan without a merge: finished.
                break
            del self.runs[p_run_id]
252
253
def refresh_tree(t, seen):
    """Save a ``TreeDoc`` for tree ``t`` and, recursively, its sub-trees.

    ``seen`` is a set of ``binsha`` values for trees that were already
    handled; trees found in it are skipped, and ``t`` is added to it.
    Entries are sorted into ``tree_ids``, ``blob_ids`` or ``other_ids``
    by their ``type``; only entries in ``other_ids`` carry the type.
    """
    if t.binsha in seen:
        return
    seen.add(t.binsha)
    doc = TreeDoc(dict(_id=t.hexsha, tree_ids=[], blob_ids=[], other_ids=[]))
    for child in t:
        entry = Object(name=h.really_unicode(child.name), id=child.hexsha)
        kind = child.type
        if kind == 'blob':
            doc.blob_ids.append(entry)
        elif kind == 'tree':
            # Store the sub-tree before recording the reference to it.
            refresh_tree(child, seen)
            doc.tree_ids.append(entry)
        else:
            entry.type = kind
            doc.other_ids.append(entry)
    doc.m.save(safe=False)
274
275
def trees(id, cache):
    """Yield ``id`` and the ids of all trees below it, in pre-order.

    ``cache`` maps a tree id to the list of its child tree ids.  Missing
    entries are loaded from ``TreeDoc`` and stored in ``cache`` right after
    the tree's own id has been yielded.

    The walk uses an explicit stack rather than nested generators, so deep
    hierarchies neither hit the interpreter's recursion limit nor pass
    every id up through one generator frame per ancestor.
    """
    pending = [id]
    while pending:
        tree_id = pending.pop()
        yield tree_id
        children = cache.get(tree_id, None)
        if children is None:
            doc = TreeDoc.m.get(_id=tree_id)
            children = [entry.id for entry in doc.tree_ids]
            cache[tree_id] = children
        # Push in reverse so the first child is popped (and walked) first.
        pending.extend(reversed(children))
285
286
def unknown_commit_ids(all_commit_ids):
    """Return the ids from ``all_commit_ids`` that have no ``CommitDoc`` yet.

    Ids are looked up in chunks of ``QSIZE``; the relative order of the
    input is preserved in the result.
    """
    result = []
    for chunk in utils.chunked_iter(all_commit_ids, QSIZE):
        # Materialise the chunk, as the other chunked_iter() users in this
        # module do: it is needed both inside the query and for the
        # filtering pass below, which a one-shot iterator could not serve.
        chunk = list(chunk)
        # Pass the query as a spec dict, matching every other find() call
        # in this module, instead of as a keyword argument.
        q = CommitDoc.m.find(dict(_id={'$in': chunk}))
        known_commit_ids = set(ci._id for ci in q)
        result += [oid for oid in chunk if oid not in known_commit_ids]
    return result
293
294
def compute_diffs(repo_id, tree_cache, rhs_ci):
    """Store the differences between ``rhs_ci`` and its first parent.

    ``tree_cache`` maps tree id -> ``TreeDoc`` and is rewritten in place so
    that afterwards it holds exactly the trees of ``rhs_ci``.  A commit
    without parents (or whose first parent is not stored) is compared
    against an empty tree.

    For each difference the right-hand object, and everything beneath it
    when it is a tree present in the index, gets its last-commit record
    set to ``rhs_ci``.  The differences are saved in a ``DiffInfoDoc``
    keyed by the commit id.  Returns ``tree_cache``.
    """
    def _walk_tree(tree, tree_index):
        # Yield the id of every object below ``tree``, descending into
        # sub-trees right after yielding their own id.
        for entry in tree.blob_ids:
            yield entry.id
        for entry in tree.other_ids:
            yield entry.id
        for entry in tree.tree_ids:
            yield entry.id
            for descendant_id in _walk_tree(tree_index[entry.id], tree_index):
                yield descendant_id

    rhs_tree_ids = TreesDoc.m.get(_id=rhs_ci._id).tree_ids
    lhs_ci = None
    if rhs_ci.parent_ids:
        lhs_ci = CommitDoc.m.get(_id=rhs_ci.parent_ids[0])
    lhs_tree_ids = []
    if lhs_ci is not None:
        lhs_tree_ids = TreesDoc.m.get(_id=lhs_ci._id).tree_ids

    # Load only the trees that the cache does not already provide.
    uncached_ids = [
        tid for tid in chain(lhs_tree_ids, rhs_tree_ids)
        if tid not in tree_cache]
    tree_index = {}
    for tree_doc in TreeDoc.m.find(dict(_id={'$in': uncached_ids}), validate=False):
        tree_index[tree_doc._id] = tree_doc
    tree_index.update(tree_cache)

    # Keep just the right-hand side's trees cached for the next call.
    wanted = set(rhs_tree_ids)
    retained = [
        (tid, tree) for tid, tree in tree_index.iteritems()
        if tid in wanted]
    tree_cache.clear()
    tree_cache.update(retained)

    rhs_tree = tree_index[rhs_ci.tree_id]
    if lhs_ci is not None:
        lhs_tree = tree_index[lhs_ci.tree_id]
    else:
        lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])

    differences = []
    for name, lhs_id, rhs_id in _diff_trees(lhs_tree, rhs_tree, tree_index):
        differences.append(dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
        # Set last commit info
        if rhs_id is not None:
            _set_last_commit(repo_id, rhs_id, rhs_ci)
        changed_tree = tree_index.get(rhs_id, None)
        if changed_tree is None:
            continue
        for oid in _walk_tree(changed_tree, tree_index):
            _set_last_commit(repo_id, oid, rhs_ci)

    di = DiffInfoDoc(dict(_id=rhs_ci._id, differences=differences))
    di.m.save()
    return tree_cache
343
344
def _diff_trees(lhs, rhs, index, *path):
    """Yield ``(name, lhs_id, rhs_id)`` for each difference between two trees.

    ``lhs`` and ``rhs`` expose ``tree_ids`` and ``blob_ids`` lists whose
    entries have ``name`` and ``id``; ``index`` maps tree id -> tree and is
    used to descend into sub-trees present on both sides with different
    ids.  ``path`` holds the enclosing directory names, innermost first.

    A removed entry has ``rhs_id`` None, an added one has ``lhs_id`` None.
    Sub-trees are reported before blobs; within each group, entries of
    ``lhs`` come first in their own order, followed by additions in the
    order they appear in ``rhs`` (so the output does not depend on dict
    iteration order).  Entries in ``other_ids`` are not compared.
    """
    def _fq(name):
        # Full slash-separated name, outermost directory first.
        return '/'.join(reversed((name,) + path))

    def _added(rhs_entries, unmatched):
        # Report what is left of ``unmatched`` in right-hand entry order.
        for entry in rhs_entries:
            if entry.name in unmatched:
                yield (_fq(entry.name), None, unmatched.pop(entry.name))

    # Diff the trees
    unmatched_trees = dict((o.name, o.id) for o in rhs.tree_ids)
    for o in lhs.tree_ids:
        rhs_id = unmatched_trees.pop(o.name, None)
        if rhs_id == o.id:
            continue  # no change
        if rhs_id is None:
            yield (_fq(o.name), o.id, None)
            continue
        # Same name, different content: report only what changed inside.
        for difference in _diff_trees(
                index[o.id], index[rhs_id], index, o.name, *path):
            yield difference
    for difference in _added(rhs.tree_ids, unmatched_trees):
        yield difference

    # Diff the blobs
    unmatched_blobs = dict((o.name, o.id) for o in rhs.blob_ids)
    for o in lhs.blob_ids:
        rhs_id = unmatched_blobs.pop(o.name, None)
        if rhs_id == o.id:
            continue  # no change
        # rhs_id is None for a removed blob, the new id for a changed one.
        yield (_fq(o.name), o.id, rhs_id)
    for difference in _added(rhs.blob_ids, unmatched_blobs):
        yield difference
379
380
def _set_last_commit(repo_id, oid, commit):
    """Save and return a ``LastCommitDoc`` naming ``commit`` for ``oid``.

    The document id is ``'<repo_id>:<oid>'``.  Only the commit id and the
    author's name, email and date are stored.
    """
    authored = commit.authored
    commit_info = dict(
        id=commit._id,
        author=authored.name,
        author_email=authored.email,
        date=authored.date,
        # Not stored: author_url, href, shortlink, summary.
        )
    lc = LastCommitDoc(dict(
        _id='%s:%s' % (repo_id, oid),
        repo_id=repo_id,
        object_id=oid,
        commit_info=commit_info))
    lc.m.save(safe=False)
    return lc
397
27
398
# Run the refresh only when executed as a script.
if __name__ == '__main__':
    main()
    # dolog()