Switch to side-by-side view

--- a/scripts/create-allura-sitemap.py
+++ b/scripts/create-allura-sitemap.py
@@ -19,10 +19,10 @@
 from pylons import c
 
 from allura import model as M
-from allura.lib import security
+from allura.lib import security, utils
 from ming.orm import session, ThreadLocalORMSession
 
-PROJECTS_PER_FILE = 1000
+MAX_SITEMAP_URLS = 50000
 BASE_URL = 'http://sourceforge.net'
 
 INDEX_TEMPLATE = """\
@@ -32,7 +32,6 @@
    <sitemap>
       <loc>{{ sitemap }}</loc>
       <lastmod>{{ now }}</lastmod>
-      <changefreq>daily</changefreq>
    </sitemap>
    {%- endfor %}
 </sitemapindex>
@@ -68,40 +67,53 @@
     except OSError, e:
         sys.exit("Error: Couldn't create %s:\n%s" % (output_path, e))
 
-    # Count projects and create sitemap index file
-    num_projects = M.Project.query.find().count()
     now = datetime.utcnow().date()
-    offsets = [i for i in range(0, num_projects, PROJECTS_PER_FILE)]
-    sitemap_index_vars = dict(
-        now=now,
-        sitemaps = [
-            '%s/allura_sitemap/sitemap-%d.xml' % (BASE_URL, offset)
-            for offset in offsets])
-    sitemap_index_content = Template(INDEX_TEMPLATE).render(sitemap_index_vars)
-    with open(os.path.join(output_path, 'sitemap.xml'), 'w') as f:
-        f.write(sitemap_index_content)
+    sitemap_content_template = Template(SITEMAP_TEMPLATE)
+    def write_sitemap(urls, file_no):
+        sitemap_content = sitemap_content_template.render(dict(
+            now=now, locs=urls))
+        with open(os.path.join(output_path, 'sitemap-%d.xml' % file_no), 'w') as f:
+            f.write(sitemap_content)
 
-    # Create urlset file for each chunk of PROJECTS_PER_FILE projects
-    sitemap_content_template = Template(SITEMAP_TEMPLATE)
     creds = security.Credentials.get()
-    for offset in offsets:
-        locs = []
-        for p in M.Project.query.find().skip(offset).limit(PROJECTS_PER_FILE):
+    locs = []
+    file_count = 0
+    # write sitemap files, MAX_SITEMAP_URLS per file
+    for chunk in utils.chunked_find(M.Project):
+        for p in chunk:
             c.project = p
             try:
-                locs += [BASE_URL + s.url for s in p.sitemap()]
+                locs += [BASE_URL + s.url if s.url[0] == '/' else s.url
+                         for s in p.sitemap()]
             except Exception, e:
                 print "Error creating sitemap for project '%s': %s" %\
-                      (p.shortname, e)
+                    (p.shortname, e)
             creds.clear()
-        sitemap_vars = dict(now=now, locs=locs)
-        sitemap_content = sitemap_content_template.render(sitemap_vars)
-        with open(os.path.join(output_path, 'sitemap-%d.xml' % offset), 'w') as f:
-            f.write(sitemap_content)
-        session(p).clear()
+            if len(locs) >= options.urls_per_file:
+                write_sitemap(locs[:options.urls_per_file], file_count)
+                del locs[:options.urls_per_file]
+                file_count += 1
+            session(p).clear()
         ThreadLocalORMSession.close_all()
+    while locs:
+        write_sitemap(locs[:options.urls_per_file], file_count)
+        del locs[:options.urls_per_file]
+        file_count += 1
+    # write sitemap index file
+    if file_count:
+        sitemap_index_vars = dict(
+            now=now,
+            sitemaps = [
+                '%s/allura_sitemap/sitemap-%d.xml' % (BASE_URL, n)
+                for n in range(file_count)])
+        sitemap_index_content = Template(INDEX_TEMPLATE).render(sitemap_index_vars)
+        with open(os.path.join(output_path, 'sitemap.xml'), 'w') as f:
+            f.write(sitemap_index_content)
 
 def parse_options():
+    def validate(option, opt_str, value, parser):
+        parser.values.urls_per_file = min(value, MAX_SITEMAP_URLS)
+
     from optparse import OptionParser
     optparser = OptionParser(
         usage='allurapaste script /var/local/config/production.ini '
@@ -109,7 +121,13 @@
     optparser.add_option('-o', '--output-dir', dest='output_dir',
                          default='/tmp/allura_sitemap',
                          help='Output directory (absolute path).'
-                              'Default is /tmp/allura_sitemap.')
+                              '[default: %default]')
+    optparser.add_option('-u', '--urls-per-file', dest='urls_per_file',
+                         default=10000, type='int',
+                         help='Number of URLs per sitemap file. '
+                         '[default: %default, max: ' +
+                         str(MAX_SITEMAP_URLS) + ']',
+                         action='callback', callback=validate)
     options, args = optparser.parse_args()
     return options, args