--- a
+++ b/Allura/allura/model/repo_refresh.py
@@ -0,0 +1,395 @@
+import logging
+from itertools import chain
+from cPickle import dumps
+
+import bson
+from tg import config
+
+from ming.base import Object
+
+from allura.lib import utils
+from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
+from allura.model.repo import LastCommitDoc, CommitRunDoc
+from allura.model.repo import Commit
+from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
+
+log = logging.getLogger(__name__)
+
+QSIZE=100
+
+def refresh_repo(repo, all_commits=False, notify=True):
+ all_commit_ids = commit_ids = list(repo.all_commit_ids())
+ if not all_commits:
+ # Skip commits that are already in the DB
+ commit_ids = unknown_commit_ids(commit_ids)
+ log.info('Refreshing %d commits', len(commit_ids))
+
+ # Refresh commits
+ seen = set()
+ for i, oid in enumerate(commit_ids):
+ repo.refresh_commit_info(oid, seen)
+ if (i+1) % 100 == 0:
+ log.info('Refresh commit info %d: %s', (i+1), oid)
+
+ refresh_commit_repos(all_commit_ids, repo)
+
+ # Refresh child references
+ seen = set()
+ parents = set()
+
+ for i, oid in enumerate(commit_ids):
+ ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
+ refresh_children(ci)
+ seen.add(ci._id)
+ parents.update(ci.parent_ids)
+ if (i+1) % 100 == 0:
+ log.info('Refresh child (a) info %d: %s', (i+1), ci._id)
+ for j, oid in enumerate(parents-seen):
+ try:
+ ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
+ except StopIteration:
+ continue
+ refresh_children(ci)
+ if (i + j + 1) % 100 == 0:
+ log.info('Refresh child (b) info %d: %s', (i + j + 1), ci._id)
+
+ # Refresh commit runs
+ rb = CommitRunBuilder(commit_ids)
+ rb.run()
+ rb.cleanup()
+
+ # Refresh trees
+ cache = {}
+ for i, oid in enumerate(commit_ids):
+ ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
+ cache = refresh_commit_trees(ci, cache)
+ if (i+1) % 100 == 0:
+ log.info('Refresh commit trees %d: %s', (i+1), ci._id)
+
+ # Compute diffs
+ cache = {}
+ for i, oid in enumerate(commit_ids):
+ ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
+ compute_diffs(repo._id, cache, ci)
+ if (i+1) % 100 == 0:
+ log.info('Compute diffs %d: %s', (i+1), ci._id)
+
+ # Send notifications
+ if notify:
+ send_notifications(commit_ids)
+
+def refresh_commit_trees(ci, cache):
+ '''Refresh the list of trees included withn a commit'''
+ trees_doc = TreesDoc(dict(
+ _id=ci._id,
+ tree_ids = list(trees(ci.tree_id, cache))))
+ trees_doc.m.save(safe=False)
+ new_cache = dict(
+ (oid, cache[oid])
+ for oid in trees_doc.tree_ids)
+ return new_cache
+
+def refresh_commit_repos(all_commit_ids, repo):
+ '''Refresh the list of repositories within which a set of commits are
+ contained'''
+ for oids in utils.chunked_iter(all_commit_ids, QSIZE):
+ for ci in CommitDoc.m.find(dict(
+ _id={'$in':list(oids)},
+ repo_ids={'$ne': repo._id})):
+ oid = ci._id
+ ci.repo_ids.append(repo._id)
+ index_id = 'allura.model.repo.Commit#' + oid
+ ref = ArtifactReferenceDoc(dict(
+ _id=index_id,
+ artifact_reference=dict(
+ cls=dumps(Commit),
+ project_id=repo.app.config.project_id,
+ app_config_id=repo.app.config._id,
+ artifact_id=oid),
+ references=[]))
+ link = ShortlinkDoc(dict(
+ _id=bson.ObjectId(),
+ ref_id=index_id,
+ project_id=repo.app.config.project_id,
+ app_config_id=repo.app.config._id,
+ link=repo.shorthand_for_commit(oid)[1:-1],
+ url=repo.url() + 'ci/' + oid + '/'))
+ # Always create a link for the full commit ID
+ link = ShortlinkDoc(dict(
+ _id=bson.ObjectId(),
+ ref_id=index_id,
+ project_id=repo.app.config.project_id,
+ app_config_id=repo.app.config._id,
+ link=oid,
+ url=repo.url() + 'ci/' + oid + '/'))
+ ref.m.save(safe=False, validate=False)
+ link.m.save(safe=False, validate=False)
+
+def refresh_children(ci):
+ '''Refresh the list of children of the given commit'''
+ CommitDoc.m.update_partial(
+ dict(_id={'$in': ci.parent_ids}),
+ {'$addToSet': dict(child_ids=ci._id)},
+ multi=True)
+
+class CommitRunBuilder(object):
+ '''Class used to build up linear runs of single-parent commits'''
+
+ def __init__(self, commit_ids):
+ self.commit_ids = commit_ids
+ self.run_index = {} # by commit ID
+ self.runs = {} # by run ID
+ self.reasons = {} # reasons to stop merging runs
+
+ def run(self):
+ '''Build up the runs'''
+ for oids in utils.chunked_iter(self.commit_ids, QSIZE):
+ oids = list(oids)
+ for ci in CommitDoc.m.find(dict(_id={'$in':oids})):
+ if ci._id in self.run_index: continue
+ self.run_index[ci._id] = ci._id
+ self.runs[ci._id] = CommitRunDoc(dict(
+ _id=ci._id,
+ parent_commit_ids=ci.parent_ids,
+ commit_ids=[ci._id],
+ commit_times=[ci.authored.date]))
+ self.merge_runs()
+ log.info('%d runs', len(self.runs))
+ for rid, run in sorted(self.runs.items()):
+ log.info('%32s: %r', self.reasons.get(rid, 'none'), run._id)
+ for run in self.runs.itervalues():
+ run.m.save()
+ return self.runs
+
+ def _all_runs(self):
+ '''Find all runs containing this builder's commit IDs'''
+ runs = {}
+ for oids in utils.chunked_iter(self.commit_ids, QSIZE):
+ oids = list(oids)
+ for run in CommitRunDoc.m.find(dict(commit_ids={'$in': oids})):
+ runs[run._id] = run
+ seen_run_ids = set()
+ runs = runs.values()
+ while runs:
+ run = runs.pop()
+ if run._id in seen_run_ids: continue
+ seen_run_ids.add(run._id)
+ yield run
+ for run in CommitRunDoc.m.find(
+ dict(commit_ids={'$in':run.parent_commit_ids})):
+ runs.append(run)
+
+ def cleanup(self):
+ '''Delete non-maximal runs'''
+ for run1 in self._all_runs():
+ for run2 in CommitRunDoc.m.find(dict(
+ commit_ids=run1.commit_ids[0])):
+ if run1._id == run2._id: continue
+ log.info('... delete %r (part of %r)', run2, run1)
+ run2.m.delete()
+
+ def merge_runs(self):
+ '''Find partial runs that may be merged and merge them'''
+ while True:
+ for run_id, run in self.runs.iteritems():
+ if len(run.parent_commit_ids) != 1:
+ self.reasons[run_id] = '%d parents' % len(run.parent_commit_ids)
+ continue
+ p_oid = run.parent_commit_ids[0]
+ p_run_id = self.run_index.get(p_oid)
+ if p_run_id is None:
+ self.reasons[run_id] = 'parent commit not found'
+ continue
+ p_run = self.runs.get(p_run_id)
+ if p_run is None:
+ self.reasons[run_id] = 'parent run not found'
+ continue
+ if p_run.commit_ids[0] != p_oid:
+ self.reasons[run_id] = 'parent does not start with parent commit'
+ continue
+ run.commit_ids += p_run.commit_ids
+ run.commit_times += p_run.commit_times
+ run.parent_commit_ids = p_run.parent_commit_ids
+ for oid in p_run.commit_ids:
+ self.run_index[oid] = run_id
+ break
+ else:
+ break
+ del self.runs[p_run_id]
+
+def trees(id, cache):
+ '''Recursively generate the list of trees contained within a given tree ID'''
+ yield id
+ entries = cache.get(id, None)
+ if entries is None:
+ t = TreeDoc.m.get(_id=id)
+ entries = [ o.id for o in t.tree_ids ]
+ cache[id] = entries
+ for i in entries:
+ for x in trees(i, cache):
+ yield x
+
+def unknown_commit_ids(all_commit_ids):
+ '''filter out all commit ids that have already been cached'''
+ result = []
+ for chunk in utils.chunked_iter(all_commit_ids, QSIZE):
+ q = CommitDoc.m.find(_id={'$in':chunk})
+ known_commit_ids = set(ci._id for ci in q)
+ result += [ oid for oid in chunk if oid not in known_commit_ids ]
+ return result
+
+def compute_diffs(repo_id, tree_cache, rhs_ci):
+ '''compute simple differences between a commit and its first parent'''
+ def _walk_tree(tree, tree_index):
+ for x in tree.blob_ids: yield x.id
+ for x in tree.other_ids: yield x.id
+ for x in tree.tree_ids:
+ yield x.id
+ for xx in _walk_tree(tree_index[x.id], tree_index):
+ yield xx
+
+ rhs_tree_ids = TreesDoc.m.get(_id=rhs_ci._id).tree_ids
+ if rhs_ci.parent_ids:
+ lhs_ci = CommitDoc.m.get(_id=rhs_ci.parent_ids[0])
+ else:
+ lhs_ci = None
+ if lhs_ci is not None:
+ lhs_tree_ids = TreesDoc.m.get(_id=lhs_ci._id).tree_ids
+ else:
+ lhs_tree_ids = []
+ new_tree_ids = [
+ tid for tid in chain(lhs_tree_ids, rhs_tree_ids)
+ if tid not in tree_cache ]
+ tree_index = dict(
+ (t._id, t) for t in TreeDoc.m.find(dict(_id={'$in': new_tree_ids}),validate=False))
+ tree_index.update(tree_cache)
+ rhs_tree_ids_set = set(rhs_tree_ids)
+ tree_cache.clear()
+ tree_cache.update(
+ (id, t) for id,t in tree_index.iteritems() if id in rhs_tree_ids_set)
+ rhs_tree = tree_index[rhs_ci.tree_id]
+ if lhs_ci is None:
+ lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
+ else:
+ lhs_tree = tree_index[lhs_ci.tree_id]
+ differences = []
+ for name, lhs_id, rhs_id in _diff_trees(lhs_tree, rhs_tree, tree_index):
+ differences.append(
+ dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
+ # Set last commit info
+ if rhs_id is not None:
+ _set_last_commit(repo_id, rhs_id, rhs_ci)
+ rhs_tree = tree_index.get(rhs_id, None)
+ if rhs_tree is not None:
+ for oid in _walk_tree(rhs_tree, tree_index):
+ _set_last_commit(repo_id, oid, rhs_ci)
+ di = DiffInfoDoc(dict(
+ _id=rhs_ci._id,
+ differences=differences))
+ di.m.save()
+ return tree_cache
+
+def send_notifications(repo, commit_ids):
+ '''Create appropriate notification and feed objects for a refresh'''
+ from allura.model import Feed, Notification
+ commit_msgs = []
+ for oids in utils.chunked_iter(commit_ids, QSIZE):
+ chunk = list(oids)
+ index = dict(
+ (doc._id, doc)
+ for doc in CommitDoc.m.find(dict(_id={'$in':chunk})))
+ for oid in chunk:
+ ci = index[oid]
+ href = '%s%sci/%s/' % (
+ config.common_prefix,
+ repo.url(),
+ oid)
+ summary = _summarize(ci.message)
+ item = Feed.post(
+ repo, title='New commit',
+ description='%s<br><a href="%s/">View Changes</a>' % (
+ summary, href))
+ item.author_link = ci.author_url
+ item.author_name = ci.authored.name
+ commit_msgs.append('%s by %s <%s>' % (
+ summary, ci.authored.name, href))
+ if commit_msgs:
+ if len(commit_msgs) > 1:
+ subject = '%d new commits to %s %s' % (
+ len(commit_msgs), repo.app.project.name, repo.app.config.options.mount_label)
+ text='\n\n'.join(commit_msgs)
+ else:
+ subject = '%s committed to %s %s: %s' % (
+ ci.authored.name,
+ repo.app.project.name,
+ repo.app.config.options.mount_label,
+ summary)
+ text = ci.message
+ Notification.post(
+ artifact=repo,
+ topic='metadata',
+ subject=subject,
+ text=text)
+
+def _summarize(message):
+ if not message: return ''
+ summary = []
+ for line in message.splitlines():
+ line = line.rstrip()
+ if line: summary.append(line)
+ else: break
+ return ' '.join(summary)
+
+def _diff_trees(lhs, rhs, index, *path):
+ def _fq(name):
+ return '/'.join(reversed(
+ (name,) + path))
+ # Diff the trees
+ rhs_tree_ids = dict(
+ (o.name, o.id)
+ for o in rhs.tree_ids)
+ for o in lhs.tree_ids:
+ rhs_id = rhs_tree_ids.pop(o.name, None)
+ if rhs_id == o.id:
+ continue # no change
+ elif rhs_id is None:
+ yield (_fq(o.name), o.id, None)
+ else:
+ for difference in _diff_trees(
+ index[o.id], index[rhs_id], index,
+ o.name, *path):
+ yield difference
+ for name, id in rhs_tree_ids.items():
+ yield (_fq(name), None, id)
+ # DIff the blobs
+ rhs_blob_ids = dict(
+ (o.name, o.id)
+ for o in rhs.blob_ids)
+ for o in lhs.blob_ids:
+ rhs_id = rhs_blob_ids.pop(o.name, None)
+ if rhs_id == o.id:
+ continue # no change
+ elif rhs_id is None:
+ yield (_fq(o.name), o.id, None)
+ else:
+ yield (_fq(o.name), o.id, rhs_id)
+ for name, id in rhs_blob_ids.items():
+ yield (_fq(name), None, id)
+
+def _set_last_commit(repo_id, oid, commit):
+ lc = LastCommitDoc(dict(
+ _id='%s:%s' % (repo_id, oid),
+ repo_id=repo_id,
+ object_id=oid,
+ commit_info=dict(
+ id=commit._id,
+ author=commit.authored.name,
+ author_email=commit.authored.email,
+ date=commit.authored.date,
+ # author_url=commit.author_url,
+ # href=commit.url(),
+ # shortlink=commit.shorthand_id(),
+ # summary=commit.summary
+ )))
+ lc.m.save(safe=False)
+ return lc