"""
Generate Allura sitemap xml files.
This takes a while to run on a prod-sized data set. There are a couple of
things that would make it faster, if we need/want to.
1. Monkeypatch forgetracker.model.ticket.Globals.bin_count to skip the
refresh (Solr search) and just return zero for everything, since we don't
need bin counts for the sitemap.
2. Use multiprocessing to distribute the offsets to n subprocesses.
"""
import os, sys
from datetime import datetime
from jinja2 import Template
import pylons, webob
from pylons import c
from allura import model as M
from allura.lib import security
from ming.orm import session, ThreadLocalORMSession
# Number of projects whose URLs are written into a single urlset file.
PROJECTS_PER_FILE = 1000

# Prefix prepended to every project-relative URL and to the sitemap file
# locations listed in the index.
BASE_URL = 'http://sourceforge.net'

# Jinja2 template for the sitemap index file. Rendered with:
#   sitemaps -- iterable of absolute URLs of the per-chunk urlset files
#   now      -- value emitted as <lastmod> for every entry
# <loc> values go through the ``e`` (escape) filter because the sitemap
# protocol requires URLs to be XML entity-escaped.
INDEX_TEMPLATE = """\
<?xml version="1.0" encoding="utf-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for sitemap in sitemaps -%}
<sitemap>
<loc>{{ sitemap|e }}</loc>
<lastmod>{{ now }}</lastmod>
<changefreq>daily</changefreq>
</sitemap>
{%- endfor %}
</sitemapindex>
"""

# Jinja2 template for one urlset file. Rendered with:
#   locs -- iterable of absolute URLs to list
#   now  -- value emitted as <lastmod> for every entry
# <loc> values are entity-escaped for the same reason as in the index.
SITEMAP_TEMPLATE = """\
<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for loc in locs -%}
<url>
<loc>{{ loc|e }}</loc>
<lastmod>{{ now }}</lastmod>
<changefreq>daily</changefreq>
</url>
{% endfor %}
</urlset>
"""
def _write_utf8(path, text):
    """Write *text* to *path* encoded as UTF-8, matching the XML declaration."""
    # Binary mode plus an explicit encode avoids the implicit ASCII encoding
    # a text-mode file applies to unicode strings on Python 2.
    with open(path, 'wb') as f:
        f.write(text.encode('utf-8'))


def main(options, args):
    """Generate the sitemap index and the per-chunk urlset files.

    Creates ``options.output_dir``, writes ``sitemap.xml`` (the index) into
    it, then writes one ``sitemap-<offset>.xml`` file for every chunk of
    PROJECTS_PER_FILE projects.

    :param options: parsed command-line options; only ``output_dir`` is read.
    :param args: positional command-line arguments (unused).

    Exits through ``sys.exit`` with an error message when the output
    directory already exists or cannot be created.
    """
    # This script will indirectly call app.sidebar_menu() for every app in
    # every project. Some of the sidebar_menu methods expect the
    # pylons.request threadlocal object to be present. So, we're faking it.
    #
    # The fact that this isn't a 'real' request doesn't matter for the
    # purposes of the sitemap.
    pylons.request._push_object(webob.Request.blank('/'))

    output_path = options.output_dir
    if os.path.exists(output_path):
        sys.exit('Error: %s directory already exists.' % output_path)
    try:
        os.mkdir(output_path)
    except OSError as e:
        sys.exit("Error: Couldn't create %s:\n%s" % (output_path, e))

    # Count projects and create sitemap index file
    num_projects = M.Project.query.find().count()
    now = datetime.utcnow().date()
    offsets = list(range(0, num_projects, PROJECTS_PER_FILE))
    sitemap_index_vars = dict(
        now=now,
        sitemaps=[
            '%s/allura_sitemap/sitemap-%d.xml' % (BASE_URL, offset)
            for offset in offsets])
    sitemap_index_content = Template(INDEX_TEMPLATE).render(sitemap_index_vars)
    _write_utf8(os.path.join(output_path, 'sitemap.xml'),
                sitemap_index_content)

    # Create urlset file for each chunk of PROJECTS_PER_FILE projects
    sitemap_content_template = Template(SITEMAP_TEMPLATE)
    creds = security.Credentials.get()
    for offset in offsets:
        locs = []
        # Remember the last project seen so its ORM session can be cleared
        # after the chunk; stays None if the page turns out to be empty.
        last_project = None
        # NOTE(review): the paged query has no explicit sort; confirm the
        # backend returns a stable order across skip/limit pages.
        projects = M.Project.query.find().skip(offset).limit(PROJECTS_PER_FILE)
        for p in projects:
            last_project = p
            c.project = p
            try:
                # Build the full list first so a failure part-way through a
                # project adds none of its URLs.
                locs += [BASE_URL + s.url for s in p.sitemap()]
            except Exception as e:
                # Best effort: report the project and carry on.
                print("Error creating sitemap for project '%s': %s" %
                      (p.shortname, e))
            creds.clear()
        sitemap_vars = dict(now=now, locs=locs)
        sitemap_content = sitemap_content_template.render(sitemap_vars)
        _write_utf8(os.path.join(output_path, 'sitemap-%d.xml' % offset),
                    sitemap_content)
        # Drop ORM state accumulated for this chunk before loading the next.
        if last_project is not None:
            session(last_project).clear()
        ThreadLocalORMSession.close_all()
def parse_options():
    """Parse the script's command-line options from ``sys.argv``.

    :returns: the ``(options, args)`` pair produced by
        ``OptionParser.parse_args()``. ``options.output_dir`` is the
        directory to write into and defaults to ``/tmp/allura_sitemap``.
    """
    from optparse import OptionParser
    optparser = OptionParser(
        usage='allurapaste script /var/local/config/production.ini '
              '-- %prog [OPTIONS]')
    optparser.add_option(
        '-o', '--output-dir', dest='output_dir',
        default='/tmp/allura_sitemap',
        # The trailing space keeps the two sentences from running together;
        # %default is expanded by optparse to the option's default value.
        help='Output directory (absolute path). '
             'Default is %default.')
    return optparser.parse_args()
# Script entry point: parse the command line and hand the resulting
# (options, args) pair straight to main().
if __name__ == '__main__':
    main(*parse_options())