--- a/Allura/test-light.py
+++ b/Allura/test-light.py
@@ -1,32 +1,17 @@
import sys
-import logging
-from collections import defaultdict
-from itertools import chain, izip
-from datetime import datetime
-from cPickle import dumps
-import bson
from pylons import c
-from pymongo.errors import DuplicateKeyError
-
-from ming.base import Object
from allura.lib import helpers as h
-from allura.lib import utils
from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
from allura.model.repo import LastCommitDoc, CommitRunDoc
-from allura.model.repo import Commit
-from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
-
-log = logging.getLogger(__name__)
-
-QSIZE=100
+from allura.model.repo_refresh import refresh_repo
def main():
if len(sys.argv) > 1:
h.set_context('test')
c.project.install_app('Git', 'code', 'Code', init_from_url='/home/rick446/src/forge')
- h.set_context('test', 'code')
+ c.project.install_app('Hg', 'code2', 'Code2', init_from_url='/home/rick446/src/Kajiki')
CommitDoc.m.remove({})
TreeDoc.m.remove({})
TreesDoc.m.remove({})
@@ -34,366 +19,11 @@
LastCommitDoc.m.remove({})
CommitRunDoc.m.remove({})
- # Get all commits (repo-specific)
- all_commit_ids = list(c.app.repo.all_commit_ids())
+ h.set_context('test', 'code')
+ refresh_repo(c.app.repo, notify=False)
+ h.set_context('test', 'code2')
+ refresh_repo(c.app.repo, notify=False)
- # Skip commits that are already in the DB (repo-agnostic)
- commit_ids = unknown_commit_ids(all_commit_ids)
- # commit_ids = commit_ids[:500]
- log.info('Refreshing %d commits', len(commit_ids))
-
- # Refresh commits (repo-specific)
- seen = set()
- for i, oid in enumerate(commit_ids):
- c.app.repo.refresh_commit_info(oid, seen)
- if (i+1) % 100 == 0:
- log.info('Refresh commit info %d: %s', (i+1), oid)
-
- #############################################
- # Everything below here is repo-agnostic
- #############################################
-
- refresh_repo(commit_ids, c.app.repo)
-
- # Refresh child references
- seen = set()
- parents = set()
-
- for i, oid in enumerate(commit_ids):
- ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
- refresh_children(ci)
- seen.add(ci._id)
- parents.update(ci.parent_ids)
- if (i+1) % 100 == 0:
- log.info('Refresh child (a) info %d: %s', (i+1), ci._id)
- for j, oid in enumerate(parents-seen):
- try:
- ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
- except StopIteration:
- continue
- refresh_children(ci)
- if (i + j + 1) % 100 == 0:
- log.info('Refresh child (b) info %d: %s', (i + j + 1), ci._id)
-
- # Refresh commit runs
- rb = CommitRunBuilder(commit_ids)
- rb.run()
- rb.cleanup()
-
- # Refresh trees
- cache = {}
- for i, oid in enumerate(commit_ids):
- ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
- cache = refresh_commit_trees(ci, cache)
- if (i+1) % 100 == 0:
- log.info('Refresh commit trees %d: %s', (i+1), ci._id)
-
- # Compute diffs
- cache = {}
- for i, oid in enumerate(commit_ids):
- ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
- compute_diffs(c.app.repo._id, cache, ci)
- if (i+1) % 100 == 0:
- log.info('Compute diffs %d: %s', (i+1), ci._id)
-
-def refresh_commit_trees(ci, cache):
- trees_doc = TreesDoc(dict(
- _id=ci._id,
- tree_ids = list(trees(ci.tree_id, cache))))
- trees_doc.m.save(safe=False)
- new_cache = dict(
- (oid, cache[oid])
- for oid in trees_doc.tree_ids)
- return new_cache
-
-def refresh_commit_info(ci, seen):
- if CommitDoc.m.find(dict(_id=ci.hexsha)).count() != 0:
- return False
- try:
- ci_doc = CommitDoc(dict(
- _id=ci.hexsha,
- tree_id=ci.tree.hexsha,
- committed = Object(
- name=h.really_unicode(ci.committer.name),
- email=h.really_unicode(ci.committer.email),
- date=datetime.utcfromtimestamp(
- ci.committed_date-ci.committer_tz_offset)),
- authored = Object(
- name=h.really_unicode(ci.author.name),
- email=h.really_unicode(ci.author.email),
- date=datetime.utcfromtimestamp(
- ci.authored_date-ci.author_tz_offset)),
- message=h.really_unicode(ci.message or ''),
- child_ids=[],
- parent_ids = [ p.hexsha for p in ci.parents ]))
- ci_doc.m.insert(safe=True)
- except DuplicateKeyError:
- return False
- refresh_tree(ci.tree, seen)
- return True
-
-def refresh_repo(commit_ids, repo):
- for oids in utils.chunked_iter(commit_ids, QSIZE):
- oids = list(oids)
- # Create shortlinks and artifactrefs
- for oid in oids:
- index_id = 'allura.model.repo.Commit#' + oid
- ref = ArtifactReferenceDoc(dict(
- _id=index_id,
- artifact_reference=dict(
- cls=dumps(Commit),
- project_id=repo.app.config.project_id,
- app_config_id=repo.app.config._id,
- artifact_id=oid),
- references=[]))
- link = ShortlinkDoc(dict(
- _id=bson.ObjectId(),
- ref_id=index_id,
- project_id=repo.app.config.project_id,
- app_config_id=repo.app.config._id,
- link=repo.shorthand_for_commit(oid),
- url=repo.url() + 'ci/' + oid + '/'))
- ref.m.save(safe=False, validate=False)
- link.m.save(safe=False, validate=False)
- CommitDoc.m.update_partial(
- dict(
- _id={'$in': oids},
- repo_ids={'$ne': repo._id}),
- {'$addToSet': dict(repo_ids=repo._id)},
- multi=True)
-
-def refresh_children(ci):
- CommitDoc.m.update_partial(
- dict(_id={'$in': ci.parent_ids}),
- {'$addToSet': dict(child_ids=ci._id)},
- multi=True)
-
-class CommitRunBuilder(object):
-
- def __init__(self, commit_ids):
- self.commit_ids = commit_ids
- self.run_index = {} # by commit ID
- self.runs = {} # by run ID
- self.reasons = {} # reasons to stop merging runs
-
- def run(self):
- for oids in utils.chunked_iter(self.commit_ids, QSIZE):
- oids = list(oids)
- commits = list(CommitDoc.m.find(dict(_id={'$in':oids})))
- for ci in commits:
- if ci._id in self.run_index: continue
- self.run_index[ci._id] = ci._id
- self.runs[ci._id] = CommitRunDoc(dict(
- _id=ci._id,
- parent_commit_ids=ci.parent_ids,
- commit_ids=[ci._id],
- commit_times=[ci.authored.date]))
- self.merge_runs()
- log.info('%d runs', len(self.runs))
- for rid, run in sorted(self.runs.items()):
- log.info('%32s: %r', self.reasons.get(rid, 'none'), run._id)
- for run in self.runs.itervalues():
- run.m.save()
- return self.runs
-
- def _all_runs(self):
- runs = {}
- for oids in utils.chunked_iter(self.commit_ids, QSIZE):
- oids = list(oids)
- for run in CommitRunDoc.m.find(dict(commit_ids={'$in': oids})):
- runs[run._id] = run
- seen_run_ids = set()
- runs = runs.values()
- while runs:
- run = runs.pop()
- if run._id in seen_run_ids: continue
- seen_run_ids.add(run._id)
- yield run
- for run in CommitRunDoc.m.find(
- dict(commit_ids={'$in':run.parent_commit_ids})):
- runs.append(run)
-
- def cleanup(self):
- '''Delete non-maximal runs'''
- for run1 in self._all_runs():
- for run2 in CommitRunDoc.m.find(dict(
- commit_ids=run1.commit_ids[0])):
- if run1._id == run2._id: continue
- log.info('... delete %r (part of %r)', run2, run1)
- run2.m.delete()
-
- def merge_runs(self):
- while True:
- for run_id, run in self.runs.iteritems():
- if len(run.parent_commit_ids) != 1:
- self.reasons[run_id] = '%d parents' % len(run.parent_commit_ids)
- continue
- p_oid = run.parent_commit_ids[0]
- p_run_id = self.run_index.get(p_oid)
- if p_run_id is None:
- self.reasons[run_id] = 'parent commit not found'
- continue
- p_run = self.runs.get(p_run_id)
- if p_run is None:
- self.reasons[run_id] = 'parent run not found'
- continue
- if p_run.commit_ids[0] != p_oid:
- self.reasons[run_id] = 'parent does not start with parent commit'
- continue
- run.commit_ids += p_run.commit_ids
- run.commit_times += p_run.commit_times
- run.parent_commit_ids = p_run.parent_commit_ids
- for oid in p_run.commit_ids:
- self.run_index[oid] = run_id
- break
- else:
- break
- del self.runs[p_run_id]
-
-def refresh_tree(t, seen):
- if t.binsha in seen: return
- seen.add(t.binsha)
- doc = TreeDoc(dict(
- _id=t.hexsha,
- tree_ids=[],
- blob_ids=[],
- other_ids=[]))
- for o in t:
- obj = Object(
- name=h.really_unicode(o.name),
- id=o.hexsha)
- if o.type == 'tree':
- refresh_tree(o, seen)
- doc.tree_ids.append(obj)
- elif o.type == 'blob':
- doc.blob_ids.append(obj)
- else:
- obj.type = o.type
- doc.other_ids.append(obj)
- doc.m.save(safe=False)
-
-def trees(id, cache):
- yield id
- entries = cache.get(id, None)
- if entries is None:
- t = TreeDoc.m.get(_id=id)
- entries = [ o.id for o in t.tree_ids ]
- cache[id] = entries
- for i in entries:
- for x in trees(i, cache):
- yield x
-
-def unknown_commit_ids(all_commit_ids):
- result = []
- for chunk in utils.chunked_iter(all_commit_ids, QSIZE):
- q = CommitDoc.m.find(_id={'$in':chunk})
- known_commit_ids = set(ci._id for ci in q)
- result += [ oid for oid in chunk if oid not in known_commit_ids ]
- return result
-
-def compute_diffs(repo_id, tree_cache, rhs_ci):
- def _walk_tree(tree, tree_index):
- for x in tree.blob_ids: yield x.id
- for x in tree.other_ids: yield x.id
- for x in tree.tree_ids:
- yield x.id
- for xx in _walk_tree(tree_index[x.id], tree_index):
- yield xx
-
- rhs_tree_ids = TreesDoc.m.get(_id=rhs_ci._id).tree_ids
- if rhs_ci.parent_ids:
- lhs_ci = CommitDoc.m.get(_id=rhs_ci.parent_ids[0])
- else:
- lhs_ci = None
- if lhs_ci is not None:
- lhs_tree_ids = TreesDoc.m.get(_id=lhs_ci._id).tree_ids
- else:
- lhs_tree_ids = []
- new_tree_ids = [
- tid for tid in chain(lhs_tree_ids, rhs_tree_ids)
- if tid not in tree_cache ]
- tree_index = dict(
- (t._id, t) for t in TreeDoc.m.find(dict(_id={'$in': new_tree_ids}),validate=False))
- tree_index.update(tree_cache)
- rhs_tree_ids_set = set(rhs_tree_ids)
- tree_cache.clear()
- tree_cache.update(
- (id, t) for id,t in tree_index.iteritems() if id in rhs_tree_ids_set)
- rhs_tree = tree_index[rhs_ci.tree_id]
- if lhs_ci is None:
- lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
- else:
- lhs_tree = tree_index[lhs_ci.tree_id]
- differences = []
- for name, lhs_id, rhs_id in _diff_trees(lhs_tree, rhs_tree, tree_index):
- differences.append(
- dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
- # Set last commit info
- if rhs_id is not None:
- _set_last_commit(repo_id, rhs_id, rhs_ci)
- rhs_tree = tree_index.get(rhs_id, None)
- if rhs_tree is not None:
- for oid in _walk_tree(rhs_tree, tree_index):
- _set_last_commit(repo_id, oid, rhs_ci)
- di = DiffInfoDoc(dict(
- _id=rhs_ci._id,
- differences=differences))
- di.m.save()
- return tree_cache
-
-def _diff_trees(lhs, rhs, index, *path):
- def _fq(name):
- return '/'.join(reversed(
- (name,) + path))
- # Diff the trees
- rhs_tree_ids = dict(
- (o.name, o.id)
- for o in rhs.tree_ids)
- for o in lhs.tree_ids:
- rhs_id = rhs_tree_ids.pop(o.name, None)
- if rhs_id == o.id:
- continue # no change
- elif rhs_id is None:
- yield (_fq(o.name), o.id, None)
- else:
- for difference in _diff_trees(
- index[o.id], index[rhs_id], index,
- o.name, *path):
- yield difference
- for name, id in rhs_tree_ids.items():
- yield (_fq(name), None, id)
- # DIff the blobs
- rhs_blob_ids = dict(
- (o.name, o.id)
- for o in rhs.blob_ids)
- for o in lhs.blob_ids:
- rhs_id = rhs_blob_ids.pop(o.name, None)
- if rhs_id == o.id:
- continue # no change
- elif rhs_id is None:
- yield (_fq(o.name), o.id, None)
- else:
- yield (_fq(o.name), o.id, rhs_id)
- for name, id in rhs_blob_ids.items():
- yield (_fq(name), None, id)
-
-def _set_last_commit(repo_id, oid, commit):
- lc = LastCommitDoc(dict(
- _id='%s:%s' % (repo_id, oid),
- repo_id=repo_id,
- object_id=oid,
- commit_info=dict(
- id=commit._id,
- author=commit.authored.name,
- author_email=commit.authored.email,
- date=commit.authored.date,
- # author_url=commit.author_url,
- # href=commit.url(),
- # shortlink=commit.shorthand_id(),
- # summary=commit.summary
- )))
- lc.m.save(safe=False)
- return lc
if __name__ == '__main__':
main()