Switch to unified view

a/scripts/create-allura-sitemap.py b/scripts/create-allura-sitemap.py
...
...
17
17
18
import pylons, webob
18
import pylons, webob
19
from pylons import c
19
from pylons import c
20
20
21
from allura import model as M
21
from allura import model as M
22
from allura.lib import security
22
from allura.lib import security, utils
23
from ming.orm import session, ThreadLocalORMSession
23
from ming.orm import session, ThreadLocalORMSession
24
24
25
PROJECTS_PER_FILE = 1000
25
MAX_SITEMAP_URLS = 50000
26
BASE_URL = 'http://sourceforge.net'
26
BASE_URL = 'http://sourceforge.net'
27
27
28
INDEX_TEMPLATE = """\
28
INDEX_TEMPLATE = """\
29
<?xml version="1.0" encoding="utf-8"?>
29
<?xml version="1.0" encoding="utf-8"?>
30
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
30
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
31
   {% for sitemap in sitemaps -%}
31
   {% for sitemap in sitemaps -%}
32
   <sitemap>
32
   <sitemap>
33
      <loc>{{ sitemap }}</loc>
33
      <loc>{{ sitemap }}</loc>
34
      <lastmod>{{ now }}</lastmod>
34
      <lastmod>{{ now }}</lastmod>
35
      <changefreq>daily</changefreq>
36
   </sitemap>
35
   </sitemap>
37
   {%- endfor %}
36
   {%- endfor %}
38
</sitemapindex>
37
</sitemapindex>
39
"""
38
"""
40
39
...
...
66
    try:
65
    try:
67
        os.mkdir(output_path)
66
        os.mkdir(output_path)
68
    except OSError, e:
67
    except OSError, e:
69
        sys.exit("Error: Couldn't create %s:\n%s" % (output_path, e))
68
        sys.exit("Error: Couldn't create %s:\n%s" % (output_path, e))
70
69
71
    # Count projects and create sitemap index file
72
    num_projects = M.Project.query.find().count()
73
    now = datetime.utcnow().date()
70
    now = datetime.utcnow().date()
74
    offsets = [i for i in range(0, num_projects, PROJECTS_PER_FILE)]
71
    sitemap_content_template = Template(SITEMAP_TEMPLATE)
75
    sitemap_index_vars = dict(
72
    def write_sitemap(urls, file_no):
76
        now=now,
73
        sitemap_content = sitemap_content_template.render(dict(
77
        sitemaps = [
74
            now=now, locs=urls))
78
            '%s/allura_sitemap/sitemap-%d.xml' % (BASE_URL, offset)
79
            for offset in offsets])
80
    sitemap_index_content = Template(INDEX_TEMPLATE).render(sitemap_index_vars)
81
    with open(os.path.join(output_path, 'sitemap.xml'), 'w') as f:
75
        with open(os.path.join(output_path, 'sitemap-%d.xml' % file_no), 'w') as f:
82
        f.write(sitemap_index_content)
76
            f.write(sitemap_content)
83
77
84
    # Create urlset file for each chunk of PROJECTS_PER_FILE projects
85
    sitemap_content_template = Template(SITEMAP_TEMPLATE)
86
    creds = security.Credentials.get()
78
    creds = security.Credentials.get()
87
    for offset in offsets:
88
        locs = []
79
    locs = []
89
        for p in M.Project.query.find().skip(offset).limit(PROJECTS_PER_FILE):
80
    file_count = 0
81
    # write sitemap files, MAX_SITEMAP_URLS per file
82
    for chunk in utils.chunked_find(M.Project):
83
        for p in chunk:
90
            c.project = p
84
            c.project = p
91
            try:
85
            try:
92
                locs += [BASE_URL + s.url for s in p.sitemap()]
86
                locs += [BASE_URL + s.url if s.url[0] == '/' else s.url
87
                         for s in p.sitemap()]
93
            except Exception, e:
88
            except Exception, e:
94
                print "Error creating sitemap for project '%s': %s" %\
89
                print "Error creating sitemap for project '%s': %s" %\
95
                      (p.shortname, e)
90
                    (p.shortname, e)
96
            creds.clear()
91
            creds.clear()
97
        sitemap_vars = dict(now=now, locs=locs)
92
            if len(locs) >= options.urls_per_file:
98
        sitemap_content = sitemap_content_template.render(sitemap_vars)
93
                write_sitemap(locs[:options.urls_per_file], file_count)
99
        with open(os.path.join(output_path, 'sitemap-%d.xml' % offset), 'w') as f:
94
                del locs[:options.urls_per_file]
100
            f.write(sitemap_content)
95
                file_count += 1
101
        session(p).clear()
96
            session(p).clear()
102
        ThreadLocalORMSession.close_all()
97
        ThreadLocalORMSession.close_all()
98
    while locs:
99
        write_sitemap(locs[:options.urls_per_file], file_count)
100
        del locs[:options.urls_per_file]
101
        file_count += 1
102
    # write sitemap index file
103
    if file_count:
104
        sitemap_index_vars = dict(
105
            now=now,
106
            sitemaps = [
107
                '%s/allura_sitemap/sitemap-%d.xml' % (BASE_URL, n)
108
                for n in range(file_count)])
109
        sitemap_index_content = Template(INDEX_TEMPLATE).render(sitemap_index_vars)
110
        with open(os.path.join(output_path, 'sitemap.xml'), 'w') as f:
111
            f.write(sitemap_index_content)
103
112
104
def parse_options():
113
def parse_options():
114
    def validate(option, opt_str, value, parser):
115
        parser.values.urls_per_file = min(value, MAX_SITEMAP_URLS)
116
105
    from optparse import OptionParser
117
    from optparse import OptionParser
106
    optparser = OptionParser(
118
    optparser = OptionParser(
107
        usage='allurapaste script /var/local/config/production.ini '
119
        usage='allurapaste script /var/local/config/production.ini '
108
              '-- %prog [OPTIONS]')
120
              '-- %prog [OPTIONS]')
109
    optparser.add_option('-o', '--output-dir', dest='output_dir',
121
    optparser.add_option('-o', '--output-dir', dest='output_dir',
110
                         default='/tmp/allura_sitemap',
122
                         default='/tmp/allura_sitemap',
111
                         help='Output directory (absolute path).'
123
                         help='Output directory (absolute path).'
112
                              'Default is /tmp/allura_sitemap.')
124
                              '[default: %default]')
125
    optparser.add_option('-u', '--urls-per-file', dest='urls_per_file',
126
                         default=10000, type='int',
127
                         help='Number of URLs per sitemap file. '
128
                         '[default: %default, max: ' +
129
                         str(MAX_SITEMAP_URLS) + ']',
130
                         action='callback', callback=validate)
113
    options, args = optparser.parse_args()
131
    options, args = optparser.parse_args()
114
    return options, args
132
    return options, args
115
133
116
if __name__ == '__main__':
134
if __name__ == '__main__':
117
    options, args = parse_options()
135
    options, args = parse_options()