opensourceprojects.eu / osp-allura / [b67150] /Allura/allura/model/repo.py

[b67150]: Allura / allura / model / repo.py History

repo.py 961 lines (848 with data), 33.9 kB

import os
import re
import sys
import logging
from hashlib import sha1
from itertools import chain
from datetime import datetime
from collections import defaultdict, OrderedDict
from difflib import SequenceMatcher, unified_diff
import bson

from pylons import c
import pymongo.errors

from ming import Field, collection, Index
from ming import schema as S
from ming.base import Object
from ming.utils import LazyProperty
from ming.orm import mapper, session

from allura.lib import utils
from allura.lib import helpers as h

from .auth import User
from .session import main_doc_session, project_doc_session
from .session import repository_orm_session

log = logging.getLogger(__name__)

# Some schema types
SUser = dict(name=str, email=str, date=datetime)
SObjType=S.OneOf('blob', 'tree', 'submodule')

# Used for when we're going to batch queries using $in
QSIZE = 100
README_RE = re.compile('^README(\.[^.]*)?$', re.IGNORECASE)
VIEWABLE_EXTENSIONS = ['.php','.py','.js','.java','.html','.htm','.yaml','.sh',
    '.rb','.phtml','.txt','.bat','.ps1','.xhtml','.css','.cfm','.jsp','.jspx',
    '.pl','.php4','.php3','.rhtml','.svg','.markdown','.json','.ini','.tcl','.vbs','.xsl']

DIFF_SIMILARITY_THRESHOLD = .5  # used for determining file renames

# Basic commit information
# One of these for each commit in the physical repo on disk. The _id is the
# hexsha of the commit (for Git and Hg).
CommitDoc = collection(
    'repo_ci', main_doc_session,
    Field('_id', str),
    Field('tree_id', str),
    Field('committed', SUser),
    Field('authored', SUser),
    Field('message', str),
    Field('parent_ids', [str], index=True),
    Field('child_ids', [str], index=True),
    Field('repo_ids', [ S.ObjectId() ], index=True))

# Basic tree information (also see TreesDoc)
TreeDoc = collection(
    'repo_tree', main_doc_session,
    Field('_id', str),
    Field('tree_ids', [dict(name=str, id=str)]),
    Field('blob_ids', [dict(name=str, id=str)]),
    Field('other_ids', [dict(name=str, id=str, type=SObjType)]))

LastCommitDoc_old = collection(
    'repo_last_commit', project_doc_session,
    Field('_id', str),
    Field('object_id', str, index=True),
    Field('name', str),
    Field('commit_info', dict(
        id=str,
        date=datetime,
        author=str,
        author_email=str,
        author_url=str,
        shortlink=str,
        summary=str)))

# Information about the last commit to touch a tree
LastCommitDoc = collection(
    'repo_last_commit', main_doc_session,
    Field('_id', S.ObjectId()),
    Field('commit_id', str),
    Field('path', str),
    Index('commit_id', 'path'),
    Field('entries', [dict(
        name=str,
        commit_id=str)]))

# List of all trees contained within a commit
# TreesDoc._id = CommitDoc._id
# TreesDoc.tree_ids = [ TreeDoc._id, ... ]
TreesDoc = collection(
    'repo_trees', main_doc_session,
    Field('_id', str),
    Field('tree_ids', [str]))

# Information about which things were added/removed in  commit
# DiffInfoDoc._id = CommitDoc._id
DiffInfoDoc = collection(
    'repo_diffinfo', main_doc_session,
    Field('_id', str),
    Field(
        'differences',
        [ dict(name=str, lhs_id=str, rhs_id=str)]))

# List of commit runs (a run is a linear series of single-parent commits)
# CommitRunDoc.commit_ids = [ CommitDoc._id, ... ]
CommitRunDoc = collection(
    'repo_commitrun', main_doc_session,
    Field('_id', str),
    Field('parent_commit_ids', [str], index=True),
    Field('commit_ids', [str], index=True),
    Field('commit_times', [datetime]))

class RepoObject(object):

    def __repr__(self): # pragma no cover
        return '<%s %s>' % (
            self.__class__.__name__, self._id)

    def primary(self):
        return self

    def index_id(self):
        '''Globally unique artifact identifier.  Used for
        SOLR ID, shortlinks, and maybe elsewhere
        '''
        id = '%s.%s#%s' % (
            self.__class__.__module__,
            self.__class__.__name__,
            self._id)
        return id.replace('.', '/')

    @classmethod
    def upsert(cls, id, **kwargs):
        isnew = False
        r = cls.query.get(_id=id)
        if r is not None: return r, isnew
        try:
            r = cls(_id=id, **kwargs)
            session(r).flush(r)
            isnew = True
        except pymongo.errors.DuplicateKeyError: # pragma no cover
            session(r).expunge(r)
            r = cls.query.get(_id=id)
        return r, isnew

class Commit(RepoObject):
    type_s = 'Commit'
    # Ephemeral attrs
    repo=None

    def set_context(self, repo):
        self.repo = repo

    @LazyProperty
    def author_url(self):
        u = User.by_email_address(self.authored.email)
        if u: return u.url()

    @LazyProperty
    def committer_url(self):
        u = User.by_email_address(self.committed.email)
        if u: return u.url()

    @LazyProperty
    def tree(self):
        if self.tree_id is None:
            self.tree_id = self.repo.compute_tree_new(self)
        if self.tree_id is None:
            return None
        cache = getattr(c, 'model_cache', '') or ModelCache()
        t = cache.get(Tree, dict(_id=self.tree_id))
        if t is None:
            self.tree_id = self.repo.compute_tree_new(self)
            t = Tree.query.get(_id=self.tree_id)
        if t is not None: t.set_context(self)
        return t

    @LazyProperty
    def summary(self):
        message = h.really_unicode(self.message)
        first_line = message.split('\n')[0]
        return h.text.truncate(first_line, 50)

    def shorthand_id(self):
        if self.repo is None: self.repo = self.guess_repo()
        if self.repo is None: return repr(self)
        return self.repo.shorthand_for_commit(self._id)

    @LazyProperty
    def symbolic_ids(self):
        return self.repo.symbolics_for_commit(self)

    def get_parent(self, index=0):
        '''Get the parent of this commit.

        If there is no parent commit, or if an invalid index is given,
        returns None.
        '''
        try:
            cache = getattr(c, 'model_cache', '') or ModelCache()
            ci = cache.get(Commit, dict(_id=self.parent_ids[index]))
            if not ci:
                return None
            ci.set_context(self.repo)
            return ci
        except IndexError as e:
            return None

    def climb_commit_tree(self, predicate=None):
        '''
        Returns a generator that walks up the commit tree along
        the first-parent ancestory, starting with this commit,
        optionally filtering by a predicate.'''
        ancestor = self
        while ancestor:
            if predicate is None or predicate(ancestor):
                yield ancestor
            ancestor = ancestor.get_parent()

    def url(self):
        if self.repo is None: self.repo = self.guess_repo()
        if self.repo is None: return '#'
        return self.repo.url_for_commit(self)

    def guess_repo(self):
        for ac in c.project.app_configs:
            try:
                app = c.project.app_instance(ac)
                if app.repo._id in self.repo_ids:
                    return app.repo
            except AttributeError:
                pass
        return None

    def link_text(self):
        '''The link text that will be used when a shortlink to this artifact
        is expanded into an <a></a> tag.

        By default this method returns shorthand_id(). Subclasses should
        override this method to provide more descriptive link text.
        '''
        return self.shorthand_id()

    def context(self):
        result = dict(prev=None, next=None)
        if self.parent_ids:
            result['prev'] = self.query.find(dict(_id={'$in': self.parent_ids })).all()
            for ci in result['prev']:
                ci.set_context(self.repo)
        if self.child_ids:
            result['next'] = self.query.find(dict(_id={'$in': self.child_ids })).all()
            for ci in result['next']:
                ci.set_context(self.repo)
        return result

    @LazyProperty
    def diffs(self):
        di = DiffInfoDoc.m.get(_id=self._id)
        if di is None:
            return Object(added=[], removed=[], changed=[], copied=[])
        added = []
        removed = []
        changed = []
        copied = []
        for change in di.differences:
            if change.rhs_id is None:
                removed.append(change.name)
            elif change.lhs_id is None:
                added.append(change.name)
            else:
                changed.append(change.name)
        copied = self._diffs_copied(added, removed)
        return Object(
            added=added, removed=removed,
            changed=changed, copied=copied)

    def _diffs_copied(self, added, removed):
        '''Return list with file renames diffs.

        Will change `added` and `removed` lists also.
        '''
        def _blobs_similarity(removed_blob, added):
            best = dict(ratio=0, name='', blob=None)
            for added_name in added:
                added_blob = self.tree.get_obj_by_path(added_name)
                if not isinstance(added_blob, Blob):
                    continue
                diff = SequenceMatcher(None, removed_blob.text,
                                       added_blob.text)
                ratio = diff.quick_ratio()
                if ratio > best['ratio']:
                    best['ratio'] = ratio
                    best['name'] = added_name
                    best['blob'] = added_blob

                if ratio == 1:
                    break  # we'll won't find better similarity than 100% :)

            if best['ratio'] > DIFF_SIMILARITY_THRESHOLD:
                diff = ''
                if best['ratio'] < 1:
                    added_blob = best['blob']
                    rpath = ('a' + removed_blob.path()).encode('utf-8')
                    apath = ('b' + added_blob.path()).encode('utf-8')
                    diff = ''.join(unified_diff(list(removed_blob),
                                                list(added_blob),
                                                rpath, apath))
                return dict(new=best['name'],
                            ratio=best['ratio'], diff=diff)

        def _trees_similarity(removed_tree, added):
            for added_name in added:
                added_tree = self.tree.get_obj_by_path(added_name)
                if not isinstance(added_tree, Tree):
                    continue
                if removed_tree._id == added_tree._id:
                    return dict(new=added_name,
                                ratio=1, diff='')

        if not removed:
            return []
        copied = []
        prev_commit = self.get_parent()
        for removed_name in removed[:]:
            removed_blob = prev_commit.tree.get_obj_by_path(removed_name)
            rename_info = None
            if isinstance(removed_blob, Blob):
                rename_info = _blobs_similarity(removed_blob, added)
            elif isinstance(removed_blob, Tree):
                rename_info = _trees_similarity(removed_blob, added)
            if rename_info is not None:
                rename_info['old'] = removed_name
                copied.append(rename_info)
                removed.remove(rename_info['old'])
                added.remove(rename_info['new'])
        return copied

    def get_path(self, path):
        path = path.lstrip('/')
        parts = path.split('/')
        cur = self.tree
        for part in parts:
            if part != '':
                cur = cur[part]
        return cur

    @LazyProperty
    def changed_paths(self):
        '''
        Returns a list of paths changed in this commit.
        Leading and trailing slashes are removed, and
        the list is complete, meaning that if a sub-path
        is changed, all of the parent paths are included
        (including '' to represent the root path).

        Example:

            If the file /foo/bar is changed in the commit,
            this would return ['', 'foo', 'foo/bar']
        '''
        diff_info = DiffInfoDoc.m.get(_id=self._id)
        diffs = set()
        if diff_info:
            for d in diff_info.differences:
                node = d.name.strip('/')
                diffs.add(node)
                node_path = os.path.dirname(node)
                while node_path:
                    diffs.add(node_path)
                    node_path = os.path.dirname(node_path)
                diffs.add('')  # include '/' if there are any changes
        return diffs

    @LazyProperty
    def info(self):
        return dict(
            id=self._id,
            author=self.authored.name,
            author_email=self.authored.email,
            date=self.authored.date,
            author_url=self.author_url,
            shortlink=self.shorthand_id(),
            summary=self.summary
            )

class Tree(RepoObject):
    # Ephemeral attrs
    repo=None
    commit=None
    parent=None
    name=None

    def compute_hash(self):
        '''Compute a hash based on the contents of the tree.  Note that this
        hash does not necessarily correspond to any actual DVCS hash.
        '''
        lines = (
            [ 'tree' + x.name + x.id for x in self.tree_ids ]
            + [ 'blob' + x.name + x.id for x in self.blob_ids ]
            + [ x.type + x.name + x.id for x in self.other_ids ])
        sha_obj = sha1()
        for line in sorted(lines):
            sha_obj.update(line)
        return sha_obj.hexdigest()

    def __getitem__(self, name):
        cache = getattr(c, 'model_cache', '') or ModelCache()
        obj = self.by_name[name]
        if obj['type'] == 'blob':
            return Blob(self, name, obj['id'])
        obj = cache.get(Tree, dict(_id=obj['id']))
        if obj is None:
            oid = self.repo.compute_tree_new(self.commit, self.path() + name + '/')
            obj = cache.get(Tree, dict(_id=oid))
        if obj is None: raise KeyError, name
        obj.set_context(self, name)
        return obj

    def get_obj_by_path(self, path):
        if hasattr(path, 'get'):
            path = path['new']
        if path.startswith('/'):
            path = path[1:]
        path = path.split('/')
        obj = self
        for p in path:
            try:
                obj = obj[p]
            except KeyError:
                return None
        return obj

    def get_blob_by_path(self, path):
        obj = self.get_obj_by_path(path)
        return obj if isinstance(obj, Blob) else None

    def set_context(self, commit_or_tree, name=None):
        assert commit_or_tree is not self
        self.repo = commit_or_tree.repo
        if name:
            self.commit = commit_or_tree.commit
            self.parent = commit_or_tree
            self.name = name
        else:
            self.commit = commit_or_tree

    def readme(self):
        'returns (filename, unicode text) if a readme file is found'
        for x in self.blob_ids:
            if README_RE.match(x.name):
                name = x.name
                blob = self[name]
                return (x.name, h.really_unicode(blob.text))
        return None, None

    def ls(self):
        '''
        List the entries in this tree, with historical commit info for
        each node.  Eventually, ls_old can be removed and this can be
        replaced with the following:

            return self._lcd_map(LastCommit.get(self))
        '''
        # look for existing new format first
        last_commit = LastCommit.get(self, create=False)
        if last_commit:
            return self._lcd_map(last_commit)
        # otherwise, try old format
        old_style_results = self.ls_old()
        if old_style_results:
            return old_style_results
        # finally, use the new implentation that auto-vivifies
        last_commit = LastCommit.get(self, create=True)
        # ensure that the LCD is saved, even if
        # there is an error later in the request
        session(last_commit).flush(last_commit)
        return self._lcd_map(LastCommit.get(self))

    def _lcd_map(self, lcd):
        if lcd is None:
            return []
        commit_ids = [e.commit_id for e in lcd.entries]
        commits = Commit.query.find(dict(_id={'$in': commit_ids}))
        commit_infos = {c._id: c.info for c in commits}
        by_name = lambda n: n.name
        tree_names = sorted([n.name for n in self.tree_ids])
        blob_names = sorted([n.name for n in chain(self.blob_ids, self.other_ids)])

        results = []
        for type, names in (('DIR', tree_names), ('BLOB', blob_names)):
            for name in names:
                commit_info = commit_infos[lcd.by_name[name]]
                results.append(dict(
                        kind=type,
                        name=name,
                        href=name,
                        last_commit=dict(
                                author=commit_info['author'],
                                author_email=commit_info['author_email'],
                                author_url=commit_info['author_url'],
                                date=commit_info['date'],
                                href=self.repo.url_for_commit(commit_info['id']),
                                shortlink=commit_info['shortlink'],
                                summary=commit_info['summary'],
                            ),
                    ))
        return results

    def ls_old(self):
        # Load last commit info
        id_re = re.compile("^{0}:{1}:".format(
            self.repo._id,
            re.escape(h.really_unicode(self.path()).encode('utf-8'))))
        lc_index = dict(
            (lc.name, lc.commit_info)
            for lc in LastCommitDoc_old.m.find(dict(_id=id_re)))

        # FIXME: Temporarily fall back to old, semi-broken lookup behavior until refresh is done
        oids = [ x.id for x in chain(self.tree_ids, self.blob_ids, self.other_ids) ]
        id_re = re.compile("^{0}:".format(self.repo._id))
        lc_index.update(dict(
            (lc.object_id, lc.commit_info)
            for lc in LastCommitDoc_old.m.find(dict(_id=id_re, object_id={'$in': oids}))))
        # /FIXME

        if not lc_index:
            # allow fallback to new method instead
            # of showing a bunch of Nones
            return []

        results = []
        def _get_last_commit(name, oid):
            lc = lc_index.get(name, lc_index.get(oid, None))
            if lc is None:
                lc = dict(
                    author=None,
                    author_email=None,
                    author_url=None,
                    date=None,
                    id=None,
                    href=None,
                    shortlink=None,
                    summary=None)
            if 'href' not in lc:
                lc['href'] = self.repo.url_for_commit(lc['id'])
            return lc
        for x in sorted(self.tree_ids, key=lambda x:x.name):
            results.append(dict(
                    kind='DIR',
                    name=x.name,
                    href=x.name + '/',
                    last_commit=_get_last_commit(x.name, x.id)))
        for x in sorted(self.blob_ids, key=lambda x:x.name):
            results.append(dict(
                    kind='FILE',
                    name=x.name,
                    href=x.name,
                    last_commit=_get_last_commit(x.name, x.id)))
        for x in sorted(self.other_ids, key=lambda x:x.name):
            results.append(dict(
                    kind=x.type,
                    name=x.name,
                    href=None,
                    last_commit=_get_last_commit(x.name, x.id)))
        return results

    def path(self):
        if self.parent:
            assert self.parent is not self
            return self.parent.path() + self.name + '/'
        else:
            return '/'

    def url(self):
        return self.commit.url() + 'tree' + self.path()

    @LazyProperty
    def by_name(self):
        d = Object((x.name, x) for x in self.other_ids)
        d.update(
            (x.name, Object(x, type='tree'))
            for x in self.tree_ids)
        d.update(
            (x.name, Object(x, type='blob'))
            for x in self.blob_ids)
        return d

    def is_blob(self, name):
        return self.by_name[name]['type'] == 'blob'

    def get_blob(self, name):
        x = self.by_name[name]
        return Blob(self, name, x.id)

class Blob(object):
    '''Lightweight object representing a file in the repo'''

    def __init__(self, tree, name, _id):
        self._id = _id
        self.tree = tree
        self.name = name
        self.repo = tree.repo
        self.commit = tree.commit
        fn, ext = os.path.splitext(self.name)
        self.extension = ext or fn

    def path(self):
        return self.tree.path() + h.really_unicode(self.name)

    def url(self):
        return self.tree.url() + h.really_unicode(self.name)

    @LazyProperty
    def prev_commit(self):
        lc = LastCommit.get(self.tree)
        if lc:
            last_commit = self.repo.commit(lc.by_name[self.name])
            prev_commit = last_commit.get_parent()
            try:
                tree = prev_commit and prev_commit.get_path(self.tree.path().rstrip('/'))
            except KeyError:
                return None  # parent tree added this commit
            if not tree or self.name not in tree.by_name:
                return None  # tree or file added this commit
            lc = LastCommit.get(tree)
            commit_id = lc and lc.by_name.get(self.name)
            if commit_id:
                prev_commit = self.repo.commit(commit_id)
                return prev_commit
        return None

    @LazyProperty
    def next_commit(self):
        try:
            path = self.path()
            cur = self.commit
            next = cur.context()['next']
            while next:
                cur = next[0]
                next = cur.context()['next']
                other_blob = cur.get_path(path)
                if other_blob is None or other_blob._id != self._id:
                    return cur
        except:
            log.exception('Lookup prev_commit')
            return None

    @LazyProperty
    def _content_type_encoding(self):
        return self.repo.guess_type(self.name)

    @LazyProperty
    def content_type(self):
        return self._content_type_encoding[0]

    @LazyProperty
    def content_encoding(self):
        return self._content_type_encoding[1]

    @property
    def has_pypeline_view(self):
        if README_RE.match(self.name) or self.extension in ['.md', '.rst']:
            return True
        return False

    @property
    def has_html_view(self):
        if (self.content_type.startswith('text/') or
            self.extension in VIEWABLE_EXTENSIONS or
            self.extension in self.repo._additional_viewable_extensions or
            utils.is_text_file(self.text)):
            return True
        return False

    @property
    def has_image_view(self):
        return self.content_type.startswith('image/')

    def context(self):
        path = self.path()
        prev = self.prev_commit
        next = self.next_commit
        if prev is not None: prev = prev.get_path(path)
        if next is not None: next = next.get_path(path)
        return dict(
            prev=prev,
            next=next)

    def open(self):
        return self.repo.open_blob(self)

    def __iter__(self):
        return iter(self.open())

    @LazyProperty
    def size(self):
        return self.repo.blob_size(self)

    @LazyProperty
    def text(self):
        return self.open().read()

    @classmethod
    def diff(cls, v0, v1):
        differ = SequenceMatcher(v0, v1)
        return differ.get_opcodes()

class LastCommit(RepoObject):
    def __repr__(self):
        return '<LastCommit /%s %s>' % (self.path, self.commit_id)

    @classmethod
    def _last_commit_id(cls, commit, path):
        commit_id = list(commit.repo.commits(path, commit._id, limit=1))
        if commit_id:
            commit_id = commit_id[0]
        else:
            log.error('Tree node not recognized by SCM: %s @ %s', path, commit._id)
            commit_id = commit._id
        return commit_id

    @classmethod
    def get(cls, tree, create=True):
        '''Find or build the LastCommitDoc for the given tree.'''
        cache = getattr(c, 'model_cache', '') or ModelCache()
        path = tree.path().strip('/')
        last_commit_id = cls._last_commit_id(tree.commit, path)
        lcd = cache.get(cls, {'path': path, 'commit_id': last_commit_id})
        if lcd is None and create:
            commit = cache.get(Commit, {'_id': last_commit_id})
            commit.set_context(tree.repo)
            lcd = cls._build(commit.get_path(path))
            cache.set(cls, {'path': path, 'commit_id': last_commit_id}, lcd)
        return lcd

    @classmethod
    def _build(cls, tree):
        '''
          Build the LCD record, presuming that this tree is where it was most
          recently changed.
        '''
        path = tree.path().strip('/')
        entries = []
        prev_lcd = None
        parent = tree.commit.get_parent()
        if parent:
            try:
                prev_lcd = cls.get(parent.get_path(path), create=False)
            except KeyError as e:
                prev_lcd = None  # will fail if path was added this commit
        entries = {}
        nodes = set([node.name for node in chain(tree.tree_ids, tree.blob_ids, tree.other_ids)])
        changed = set([node for node in nodes if os.path.join(path, node) in tree.commit.changed_paths])
        if prev_lcd:
            # get unchanged entries from previously computed LCD
            entries = prev_lcd.by_name
        else:
            # no previously computed LCD, so get unchanged entries from SCM
            # (but only ask for the ones that we know we need)
            unchanged = [os.path.join(path, node) for node in nodes - changed]
            entries = tree.commit.repo.last_commit_ids(tree.commit, unchanged)
            if entries is None:
                # something strange went wrong; bail out and possibly try again later
                return None
            # paths are fully-qualified; shorten them back to just node names
            entries = {os.path.basename(path):commit_id for path,commit_id in entries.iteritems()}
        # update with the nodes changed in this tree's commit
        entries.update({node: tree.commit._id for node in changed})
        # convert to a list of dicts, since mongo doesn't handle arbitrary keys well (i.e., . and $ not allowed)
        entries = [{'name':name, 'commit_id':value} for name,value in entries.iteritems()]
        lcd = cls(
                commit_id=tree.commit._id,
                path=path,
                entries=entries,
            )
        return lcd

    @LazyProperty
    def by_name(self):
        return {n.name: n.commit_id for n in self.entries}

mapper(Commit, CommitDoc, repository_orm_session)
mapper(Tree, TreeDoc, repository_orm_session)
mapper(LastCommit, LastCommitDoc, repository_orm_session)


class ModelCache(object):
    '''
    Cache model instances based on query params passed to get.
    '''
    def __init__(self, max_instances=None, max_queries=None):
        '''
        By default, each model type can have 2000 instances and
        8000 queries.  You can override these for specific model
        types by passing in a dict() for either max_instances or
        max_queries keyed by the class(es) with the max values.
        Classes not in the dict() will use the default 2000/8000
        default.

        If you pass in a number instead of a dict, that value will
        be used as the max for all classes.
        '''
        max_instances_default = 2000
        max_queries_default = 8000
        if isinstance(max_instances, int):
            max_instances_default = max_instances
        if isinstance(max_queries, int):
            max_queries_default = max_queries
        self._max_instances = defaultdict(lambda:max_instances_default)
        self._max_queries = defaultdict(lambda:max_queries_default)
        if hasattr(max_instances, 'items'):
            self._max_instances.update(max_instances)
        if hasattr(max_queries, 'items'):
            self._max_queries.update(max_queries)

        self._query_cache = defaultdict(OrderedDict)  # keyed by query, holds _id
        self._instance_cache = defaultdict(OrderedDict)  # keyed by _id
        self._synthetic_ids = defaultdict(set)
        self._synthetic_id_queries = defaultdict(set)

    def _normalize_query(self, query):
        _query = query
        if not isinstance(_query, tuple):
            _query = tuple(sorted(_query.items(), key=lambda k: k[0]))
        return _query

    def _model_query(self, cls):
        if hasattr(cls, 'query'):
            return cls.query
        elif hasattr(cls, 'm'):
            return cls.m
        else:
            raise AttributeError('%s has neither "query" nor "m" attribute' % cls)

    def get(self, cls, query):
        _query = self._normalize_query(query)
        self._touch(cls, _query)
        if _query not in self._query_cache[cls]:
            val = self._model_query(cls).get(**query)
            self.set(cls, _query, val)
            return val
        _id = self._query_cache[cls][_query]
        if _id is None:
            return None
        if _id not in self._instance_cache[cls]:
            val = self._model_query(cls).get(**query)
            self.set(cls, _query, val)
            return val
        return self._instance_cache[cls][_id]

    def set(self, cls, query, val):
        _query = self._normalize_query(query)
        if val is not None:
            _id = getattr(val, '_model_cache_id',
                    getattr(val, '_id',
                        self._query_cache[cls].get(_query,
                            None)))
            if _id is None:
                _id = val._model_cache_id = bson.ObjectId()
                self._synthetic_ids[cls].add(_id)
            if _id in self._synthetic_ids:
                self._synthetic_id_queries[cls].add(_query)
            self._query_cache[cls][_query] = _id
            self._instance_cache[cls][_id] = val
        else:
            self._query_cache[cls][_query] = None
        self._touch(cls, _query)
        self._check_sizes(cls)

    def _touch(self, cls, query):
        '''
        Keep track of insertion order, prevent duplicates,
        and expire from the cache in a FIFO manner.
        '''
        _query = self._normalize_query(query)
        if _query not in self._query_cache[cls]:
            return
        _id = self._query_cache[cls].pop(_query)
        self._query_cache[cls][_query] = _id

        if _id not in self._instance_cache[cls]:
            return
        val = self._instance_cache[cls].pop(_id)
        self._instance_cache[cls][_id] = val

    def _check_sizes(self, cls):
        if self.num_queries(cls) > self._max_queries[cls]:
            _id = self._remove_least_recently_used(self._query_cache[cls])
            if _id in self._instance_cache[cls]:
                instance = self._instance_cache[cls][_id]
                self._try_flush(instance, expunge=False)
        if self.num_instances(cls) > self._max_instances[cls]:
            instance = self._remove_least_recently_used(self._instance_cache[cls])
            self._try_flush(instance, expunge=True)

    def _try_flush(self, instance, expunge=False):
        try:
            inst_session = session(instance)
        except AttributeError:
            inst_session = None
        if inst_session:
            inst_session.flush(instance)
            if expunge:
                inst_session.expunge(instance)

    def _remove_least_recently_used(self, cache):
        # last-used (most-recently-used) is last in cache, so take first
        key, val = cache.popitem(last=False)
        return val

    def expire_new_instances(self, cls):
        '''
        Expire any instances that were "new" or had no _id value.

        If a lot of new instances of a class are being created, it's possible
        for a query to pull a copy from mongo when a copy keyed by the synthetic
        ID is still in the cache, potentially causing de-sync between the copies
        leading to one with missing data overwriting the other.  Clear new
        instances out of the cache relatively frequently (depending on the query
        and instance cache sizes) to avoid this.
        '''
        for _query in self._synthetic_id_queries[cls]:
            self._query_cache[cls].pop(_query)
        self._synthetic_id_queries[cls] = set()
        for _id in self._synthetic_ids[cls]:
            instance = self._instance_cache[cls].pop(_id)
            self._try_flush(instance, expunge=True)
        self._synthetic_ids[cls] = set()

    def num_queries(self, cls=None):
        if cls is None:
            return sum([len(c) for c in self._query_cache.values()])
        else:
            return len(self._query_cache[cls])

    def num_instances(self, cls=None):
        if cls is None:
            return sum([len(c) for c in self._instance_cache.values()])
        else:
            return len(self._instance_cache[cls])

    def instance_ids(self, cls):
        return self._instance_cache[cls].keys()

    def batch_load(self, cls, query, attrs=None):
        '''
        Load multiple results given a query.

        Optionally takes a list of attribute names to use
        as the cache key.  If not given, uses the keys of
        the given query.
        '''
        if attrs is None:
            attrs = query.keys()
        for result in self._model_query(cls).find(query):
            keys = {a: getattr(result, a) for a in attrs}
            self.set(cls, keys, result)