Switch to side-by-side view

--- a/Allura/test-light.py
+++ b/Allura/test-light.py
@@ -3,60 +3,49 @@
 from collections import defaultdict
 from itertools import chain, izip
 from datetime import datetime
-
+from cPickle import dumps
+
+import bson
 from pylons import c
 from pymongo.errors import DuplicateKeyError
 
 from ming.base import Object
 
-from allura import model as M
 from allura.lib import helpers as h
 from allura.lib import utils
+from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
+from allura.model.repo import LastCommitDoc, CommitRunDoc
+from allura.model.repo import Commit
+from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
 
 log = logging.getLogger(__name__)
 
 QSIZE=100
-
-def dolog():
-    h.set_context('test', 'code')
-    repo = c.app.repo._impl._git
-    oid = repo.commit(repo.heads[0]).hexsha
-    log.info('start')
-    for i, ci in enumerate(commitlog(oid)):
-        print repr(ci)
-    log.info('done')
 
 def main():
     if len(sys.argv) > 1:
         h.set_context('test')
         c.project.install_app('Git', 'code', 'Code', init_from_url='/home/rick446/src/forge')
     h.set_context('test', 'code')
-    M.repo.Commit.m.remove({})
-    M.repo.Tree.m.remove({})
-    M.repo.Trees.m.remove({})
-    M.repo.DiffInfo.m.remove({})
-    M.repo.LastCommit.m.remove({})
-    M.repo.BasicBlock.m.remove({})
-    repo = c.app.repo._impl._git
-
-    # Get all commits
-    seen = set()
-    all_commit_ids = []
-    for head in repo.heads:
-        for ci in repo.iter_commits(head, topo_order=True):
-            if ci.binsha in seen: continue
-            seen.add(ci.binsha)
-            all_commit_ids.append(ci.hexsha)
-
-    # Skip commits that are already in the DB
+    CommitDoc.m.remove({})
+    TreeDoc.m.remove({})
+    TreesDoc.m.remove({})
+    DiffInfoDoc.m.remove({})
+    LastCommitDoc.m.remove({})
+    CommitRunDoc.m.remove({})
+
+    # Get all commits (repo-specific)
+    all_commit_ids = list(c.app.repo.all_commit_ids())
+
+    # Skip commits that are already in the DB (repo-agnostic)
     commit_ids = unknown_commit_ids(all_commit_ids)
     # commit_ids = commit_ids[:500]
     log.info('Refreshing %d commits', len(commit_ids))
 
-    # Refresh commits
+    # Refresh commits (repo-specific)
+    seen = set()
     for i, oid in enumerate(commit_ids):
-        ci = repo.rev_parse(oid)
-        refresh_commit_info(ci, seen)
+        c.app.repo.refresh_commit_info(oid, seen)
         if (i+1) % 100 == 0:
             log.info('Refresh commit info %d: %s', (i+1), oid)
 
@@ -64,14 +53,14 @@
     # Everything below here is repo-agnostic
     #############################################
 
-    refresh_repo(commit_ids, c.app.repo._id)
+    refresh_repo(commit_ids, c.app.repo)
 
     # Refresh child references
     seen = set()
     parents = set()
 
     for i, oid in enumerate(commit_ids):
-        ci = M.repo.Commit.m.find(dict(_id=oid), validate=False).next()
+        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
         refresh_children(ci)
         seen.add(ci._id)
         parents.update(ci.parent_ids)
@@ -79,28 +68,22 @@
             log.info('Refresh child (a) info %d: %s', (i+1), ci._id)
     for j, oid in enumerate(parents-seen):
         try:
-            ci = M.repo.Commit.m.find(dict(_id=oid), validate=False).next()
+            ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
         except StopIteration:
             continue
         refresh_children(ci)
         if (i + j + 1) % 100 == 0:
             log.info('Refresh child (b) info %d: %s', (i + j + 1), ci._id)
 
-    # Refresh basic blocks
-    bbb = BasicBlockBuilder(commit_ids)
-    bbb.run()
-    bbb.cleanup()
-
-    # Verify the log
-    log.info('Logging via basic blocks')
-    for i, ci in enumerate(commitlog(commit_ids[0])):
-        pass
-    log.info('... done (%d commits from %s)', i+1, commit_ids[0])
+    # Refresh commit runs
+    rb = CommitRunBuilder(commit_ids)
+    rb.run()
+    rb.cleanup()
 
     # Refresh trees
     cache = {}
     for i, oid in enumerate(commit_ids):
-        ci = M.repo.Commit.m.find(dict(_id=oid), validate=False).next()
+        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
         cache = refresh_commit_trees(ci, cache)
         if (i+1) % 100 == 0:
             log.info('Refresh commit trees %d: %s', (i+1), ci._id)
@@ -108,13 +91,13 @@
     # Compute diffs
     cache = {}
     for i, oid in enumerate(commit_ids):
-        ci = M.repo.Commit.m.find(dict(_id=oid), validate=False).next()
+        ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
         compute_diffs(c.app.repo._id, cache, ci)
         if (i+1) % 100 == 0:
             log.info('Compute diffs %d: %s', (i+1), ci._id)
 
 def refresh_commit_trees(ci, cache):
-    trees_doc = M.repo.Trees(dict(
+    trees_doc = TreesDoc(dict(
             _id=ci._id,
             tree_ids = list(trees(ci.tree_id, cache))))
     trees_doc.m.save(safe=False)
@@ -124,10 +107,10 @@
     return new_cache
 
 def refresh_commit_info(ci, seen):
-    if M.repo.Commit.m.find(dict(_id=ci.hexsha)).count() != 0:
+    if CommitDoc.m.find(dict(_id=ci.hexsha)).count() != 0:
         return False
     try:
-        ci_doc = M.repo.Commit(dict(
+        ci_doc = CommitDoc(dict(
                 _id=ci.hexsha,
                 tree_id=ci.tree.hexsha,
                 committed = Object(
@@ -149,109 +132,128 @@
     refresh_tree(ci.tree, seen)
     return True
 
-def refresh_repo(commit_ids, repo_id):
+def refresh_repo(commit_ids, repo):
     for oids in utils.chunked_iter(commit_ids, QSIZE):
         oids = list(oids)
-        M.repo.Commit.m.update_partial(
+        # Create shortlinks and artifactrefs
+        for oid in oids:
+            index_id = 'allura.model.repo.Commit#' + oid
+            ref = ArtifactReferenceDoc(dict(
+                    _id=index_id,
+                    artifact_reference=dict(
+                        cls=dumps(Commit),
+                        project_id=repo.app.config.project_id,
+                    app_config_id=repo.app.config._id,
+                        artifact_id=oid),
+                    references=[]))
+            link = ShortlinkDoc(dict(
+                    _id=bson.ObjectId(),
+                    ref_id=index_id,
+                    project_id=repo.app.config.project_id,
+                    app_config_id=repo.app.config._id,
+                    link=repo.shorthand_for_commit(oid),
+                    url=repo.url() + 'ci/' + oid + '/'))
+            ref.m.save(safe=False, validate=False)
+            link.m.save(safe=False, validate=False)
+        CommitDoc.m.update_partial(
             dict(
                 _id={'$in': oids},
-                repo_ids={'$ne': repo_id}),
-            {'$addToSet': dict(repo_ids=repo_id)},
+                repo_ids={'$ne': repo._id}),
+            {'$addToSet': dict(repo_ids=repo._id)},
             multi=True)
 
 def refresh_children(ci):
-    M.repo.Commit.m.update_partial(
+    CommitDoc.m.update_partial(
         dict(_id={'$in': ci.parent_ids}),
         {'$addToSet': dict(child_ids=ci._id)},
         multi=True)
 
-class BasicBlockBuilder(object):
+class CommitRunBuilder(object):
 
     def __init__(self, commit_ids):
         self.commit_ids = commit_ids
-        self.block_index = {} # by commit ID
-        self.blocks = {}          # by block ID
-        self.reasons = {}        # reasons to stop merging blocks
+        self.run_index = {} # by commit ID
+        self.runs = {}          # by run ID
+        self.reasons = {}    # reasons to stop merging runs
 
     def run(self):
         for oids in utils.chunked_iter(self.commit_ids, QSIZE):
             oids = list(oids)
-            commits = list(M.repo.Commit.m.find(dict(_id={'$in':oids})))
+            commits = list(CommitDoc.m.find(dict(_id={'$in':oids})))
             for ci in commits:
-                if ci._id in self.block_index: continue
-                self.block_index[ci._id] = ci._id
-                self.blocks[ci._id] = M.repo.BasicBlock(dict(
+                if ci._id in self.run_index: continue
+                self.run_index[ci._id] = ci._id
+                self.runs[ci._id] = CommitRunDoc(dict(
                         _id=ci._id,
                         parent_commit_ids=ci.parent_ids,
                         commit_ids=[ci._id],
                         commit_times=[ci.authored.date]))
-            self.merge_blocks()
-        log.info('%d basic blocks', len(self.blocks))
-        for bid, bb in sorted(self.blocks.items()):
-            log.info('%32s: %r', self.reasons.get(bid, 'none'), bb)
-        for bb in self.blocks.itervalues():
-            bb.m.save()
-        return self.blocks
-
-    def _all_blocks(self):
-        blocks = {}
+            self.merge_runs()
+        log.info('%d runs', len(self.runs))
+        for rid, run in sorted(self.runs.items()):
+            log.info('%32s: %r', self.reasons.get(rid, 'none'), run._id)
+        for run in self.runs.itervalues():
+            run.m.save()
+        return self.runs
+
+    def _all_runs(self):
+        runs = {}
         for oids in utils.chunked_iter(self.commit_ids, QSIZE):
             oids = list(oids)
-            for bb in M.repo.BasicBlock.m.find(dict(commit_ids={'$in': oids})):
-                blocks[bb._id] = bb
-        seen_bids = set()
-        blocks = blocks.values()
-        while blocks:
-            bb = blocks.pop()
-            if bb._id in seen_bids: continue
-            seen_bids.add(bb._id)
-            yield bb
-            for bb in M.repo.BasicBlock.m.find(
-                dict(commit_ids={'$in':bb.parent_commit_ids})):
-                blocks.append(bb)
+            for run in CommitRunDoc.m.find(dict(commit_ids={'$in': oids})):
+                runs[run._id] = run
+        seen_run_ids = set()
+        runs = runs.values()
+        while runs:
+            run = runs.pop()
+            if run._id in seen_run_ids: continue
+            seen_run_ids.add(run._id)
+            yield run
+            for run in CommitRunDoc.m.find(
+                dict(commit_ids={'$in':run.parent_commit_ids})):
+                runs.append(run)
 
     def cleanup(self):
-        '''Delete non-maximal basic blocks'''
-        for bb1 in self._all_blocks():
-            for bb2 in M.repo.BasicBlock.m.find(dict(
-                    commit_ids=bb1.commit_ids[0])):
-                if bb2._id == bb1._id: continue
-                log.info('... delete %r (part of %r)', bb2, bb1)
-                import pdb; pdb.set_trace()
-                bb2.m.delete()
-
-    def merge_blocks(self):
+        '''Delete non-maximal runs'''
+        for run1 in self._all_runs():
+            for run2 in CommitRunDoc.m.find(dict(
+                    commit_ids=run1.commit_ids[0])):
+                if run1._id == run2._id: continue
+                log.info('... delete %r (part of %r)', run2, run1)
+                run2.m.delete()
+
+    def merge_runs(self):
         while True:
-            for bid, bb in self.blocks.iteritems():
-                if len(bb.parent_commit_ids) != 1:
-                    self.reasons[bid] = '%d parents' % len(bb.parent_commit_ids)
+            for run_id, run in self.runs.iteritems():
+                if len(run.parent_commit_ids) != 1:
+                    self.reasons[run_id] = '%d parents' % len(run.parent_commit_ids)
                     continue
-                p_oid = bb.parent_commit_ids[0]
-                p_bid = self.block_index.get(p_oid)
-                if p_bid is None:
-                    self.reasons[bid] = 'parent commit not found'
+                p_oid = run.parent_commit_ids[0]
+                p_run_id = self.run_index.get(p_oid)
+                if p_run_id is None:
+                    self.reasons[run_id] = 'parent commit not found'
                     continue
-                p_bb = self.blocks.get(p_bid)
-                if p_bb is None:
-                    self.reasons[bid] = 'parent block not found'
+                p_run = self.runs.get(p_run_id)
+                if p_run is None:
+                    self.reasons[run_id] = 'parent run not found'
                     continue
-                if p_bb.commit_ids[0] != p_oid:
-                    self.reasons[bid] = 'parent does not start with parent commit'
+                if p_run.commit_ids[0] != p_oid:
+                    self.reasons[run_id] = 'parent does not start with parent commit'
                     continue
-                bb.commit_ids += p_bb.commit_ids
-                bb.commit_times += p_bb.commit_times
-                bb.parent_commit_ids = p_bb.parent_commit_ids
-                for oid in p_bb.commit_ids:
-                    self.block_index[oid] = bid
+                run.commit_ids += p_run.commit_ids
+                run.commit_times += p_run.commit_times
+                run.parent_commit_ids = p_run.parent_commit_ids
+                for oid in p_run.commit_ids:
+                    self.run_index[oid] = run_id
                 break
             else:
                 break
-            del self.blocks[p_bid]
+            del self.runs[p_run_id]
 
 def refresh_tree(t, seen):
     if t.binsha in seen: return
     seen.add(t.binsha)
-    doc = M.repo.Tree(dict(
+    doc = TreeDoc(dict(
             _id=t.hexsha,
             tree_ids=[],
             blob_ids=[],
@@ -274,7 +276,7 @@
     yield id
     entries = cache.get(id, None)
     if entries is None:
-        t = M.repo.Tree.m.get(_id=id)
+        t = TreeDoc.m.get(_id=id)
         entries = [ o.id for o in t.tree_ids ]
         cache[id] = entries
     for i in entries:
@@ -284,7 +286,7 @@
 def unknown_commit_ids(all_commit_ids):
     result = []
     for chunk in utils.chunked_iter(all_commit_ids, QSIZE):
-        q = M.repo.Commit.m.find(_id={'$in':chunk})
+        q = CommitDoc.m.find(_id={'$in':chunk})
         known_commit_ids = set(ci._id for ci in q)
         result += [ oid for oid in chunk if oid not in known_commit_ids ]
     return result
@@ -298,20 +300,20 @@
             for xx in _walk_tree(tree_index[x.id], tree_index):
                 yield xx
 
-    rhs_tree_ids = M.repo.Trees.m.get(_id=rhs_ci._id).tree_ids
+    rhs_tree_ids = TreesDoc.m.get(_id=rhs_ci._id).tree_ids
     if rhs_ci.parent_ids:
-        lhs_ci = M.repo.Commit.m.get(_id=rhs_ci.parent_ids[0])
+        lhs_ci = CommitDoc.m.get(_id=rhs_ci.parent_ids[0])
     else:
         lhs_ci = None
     if lhs_ci is not None:
-        lhs_tree_ids = M.repo.Trees.m.get(_id=lhs_ci._id).tree_ids
+        lhs_tree_ids = TreesDoc.m.get(_id=lhs_ci._id).tree_ids
     else:
         lhs_tree_ids = []
     new_tree_ids = [
         tid for tid in chain(lhs_tree_ids, rhs_tree_ids)
         if tid not in tree_cache ]
     tree_index = dict(
-        (t._id, t) for t in M.repo.Tree.m.find(dict(_id={'$in': new_tree_ids}),validate=False))
+        (t._id, t) for t in TreeDoc.m.find(dict(_id={'$in': new_tree_ids}),validate=False))
     tree_index.update(tree_cache)
     rhs_tree_ids_set = set(rhs_tree_ids)
     tree_cache.clear()
@@ -328,82 +330,16 @@
             dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
         # Set last commit info
         if rhs_id is not None:
-            M.repo.LastCommit.set_last_commit(repo_id, rhs_id, rhs_ci)
+            _set_last_commit(repo_id, rhs_id, rhs_ci)
         rhs_tree = tree_index.get(rhs_id, None)
         if rhs_tree is not None:
             for oid in _walk_tree(rhs_tree, tree_index):
-                M.repo.LastCommit.set_last_commit(repo_id, oid, rhs_ci)
-    di = M.repo.DiffInfo(dict(
+                _set_last_commit(repo_id, oid, rhs_ci)
+    di = DiffInfoDoc(dict(
             _id=rhs_ci._id,
             differences=differences))
     di.m.save()
     return tree_cache
-
-def commitlog(commit_id, skip=0, limit=sys.maxint):
-
-    seen = set()
-    def _visit(commit_id):
-        if commit_id in seen: return
-        bb = M.repo.BasicBlock.m.get(commit_ids=commit_id)
-        if bb is None: return
-        index = False
-        for pos, (oid, time) in enumerate(izip(bb.commit_ids, bb.commit_times)):
-            if oid == commit_id: index = True
-            elif not index: continue
-            seen.add(oid)
-            ci_times[oid] = time
-            if pos+1 < len(bb.commit_ids):
-                ci_parents[oid] = [ bb.commit_ids[pos+1] ]
-            else:
-                ci_parents[oid] = bb.parent_commit_ids
-        for oid in bb.parent_commit_ids:
-            _visit(oid)
-
-    def _gen_ids(commit_id, skip, limit):
-        # Traverse the graph in topo order, yielding commit IDs
-        commits = set([commit_id])
-        new_parent = None
-        while commits and limit:
-            # next commit is latest commit that's valid to log
-            if new_parent in commits:
-                ci = new_parent
-            else:
-                ci = max(commits, key=lambda ci:ci_times[ci])
-            commits.remove(ci)
-            if skip:
-                skip -= 1
-                continue
-            else:
-                limit -= 1
-            yield ci
-            # remove this commit from its parents children and add any childless
-            # parents to the 'ready set'
-            new_parent = None
-            for oid in ci_parents[ci]:
-                children = ci_children[oid]
-                children.discard(ci)
-                if not children:
-                    commits.add(oid)
-                    new_parent = oid
-
-    # Load all the blocks to build a commit graph
-    ci_times = {}
-    ci_parents = {}
-    ci_children = defaultdict(set)
-    log.info('Build commit graph')
-    _visit(commit_id)
-    for oid, parents in ci_parents.iteritems():
-        for ci_parent in parents:
-            ci_children[ci_parent].add(oid)
-
-    # Convert oids to commit objects
-    log.info('Traverse commit graph')
-    for oids in utils.chunked_iter(_gen_ids(commit_id, skip, limit), QSIZE):
-        oids = list(oids)
-        index = dict(
-            (ci._id, ci) for ci in M.repo.Commit.m.find(dict(_id={'$in': oids})))
-        for oid in oids:
-            yield index[oid]
 
 def _diff_trees(lhs, rhs, index, *path):
     def _fq(name):
@@ -441,6 +377,24 @@
     for name, id in rhs_blob_ids.items():
         yield (_fq(name), None, id)
 
+def _set_last_commit(repo_id, oid, commit):
+    lc = LastCommitDoc(dict(
+            _id='%s:%s' % (repo_id, oid),
+            repo_id=repo_id,
+            object_id=oid,
+            commit_info=dict(
+                id=commit._id,
+                author=commit.authored.name,
+                author_email=commit.authored.email,
+                date=commit.authored.date,
+                # author_url=commit.author_url,
+                # href=commit.url(),
+                # shortlink=commit.shorthand_id(),
+                # summary=commit.summary
+                )))
+    lc.m.save(safe=False)
+    return lc
+
 if __name__ == '__main__':
     main()
     # dolog()