|
a/scripts/create-allura-sitemap.py |
|
b/scripts/create-allura-sitemap.py |
|
... |
|
... |
17 |
|
17 |
|
18 |
import pylons, webob
|
18 |
import pylons, webob
|
19 |
from pylons import c
|
19 |
from pylons import c
|
20 |
|
20 |
|
21 |
from allura import model as M
|
21 |
from allura import model as M
|
22 |
from allura.lib import security
|
22 |
from allura.lib import security, utils
|
23 |
from ming.orm import session, ThreadLocalORMSession
|
23 |
from ming.orm import session, ThreadLocalORMSession
|
24 |
|
24 |
|
25 |
# Chunk size (projects per sitemap file) used by the older offset-based
# generation code; kept so remaining references to it still resolve.
PROJECTS_PER_FILE = 1000
# Upper bound on the number of URLs written to one sitemap file; the
# --urls-per-file option is capped to this value.
MAX_SITEMAP_URLS = 50000
# Scheme and host prepended to site-relative URLs and to the sitemap file
# locations listed in the index.
BASE_URL = 'http://sourceforge.net'

# Template for the sitemap index file (sitemap.xml).  It is rendered with
# ``sitemaps`` (the URLs of the generated sitemap-N.xml files) and ``now``
# (the generation date).  Index entries carry only <loc> and <lastmod>;
# <changefreq> is not part of the sitemap index schema, so it is not emitted.
INDEX_TEMPLATE = """\
<?xml version="1.0" encoding="utf-8"?>
<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
{% for sitemap in sitemaps -%}
<sitemap>
<loc>{{ sitemap }}</loc>
<lastmod>{{ now }}</lastmod>
</sitemap>
{%- endfor %}
</sitemapindex>
"""
|
40 |
|
39 |
|
|
... |
|
... |
66 |
try:
|
65 |
try:
|
67 |
os.mkdir(output_path)
|
66 |
os.mkdir(output_path)
|
68 |
except OSError, e:
|
67 |
except OSError, e:
|
69 |
sys.exit("Error: Couldn't create %s:\n%s" % (output_path, e))
|
68 |
sys.exit("Error: Couldn't create %s:\n%s" % (output_path, e))
|
70 |
|
69 |
|
71 |
# Count projects and create sitemap index file
|
|
|
72 |
num_projects = M.Project.query.find().count()
|
|
|
73 |
now = datetime.utcnow().date()
|
70 |
now = datetime.utcnow().date()
|
74 |
offsets = [i for i in range(0, num_projects, PROJECTS_PER_FILE)]
|
71 |
sitemap_content_template = Template(SITEMAP_TEMPLATE)
|
75 |
sitemap_index_vars = dict(
|
72 |
def write_sitemap(urls, file_no):
|
76 |
now=now,
|
73 |
sitemap_content = sitemap_content_template.render(dict(
|
77 |
sitemaps = [
|
74 |
now=now, locs=urls))
|
78 |
'%s/allura_sitemap/sitemap-%d.xml' % (BASE_URL, offset)
|
|
|
79 |
for offset in offsets])
|
|
|
80 |
sitemap_index_content = Template(INDEX_TEMPLATE).render(sitemap_index_vars)
|
|
|
81 |
with open(os.path.join(output_path, 'sitemap.xml'), 'w') as f:
|
75 |
with open(os.path.join(output_path, 'sitemap-%d.xml' % file_no), 'w') as f:
|
82 |
f.write(sitemap_index_content)
|
76 |
f.write(sitemap_content)
|
83 |
|
77 |
|
84 |
# Create urlset file for each chunk of PROJECTS_PER_FILE projects
|
|
|
85 |
sitemap_content_template = Template(SITEMAP_TEMPLATE)
|
|
|
86 |
creds = security.Credentials.get()
|
78 |
creds = security.Credentials.get()
|
87 |
for offset in offsets:
|
|
|
88 |
locs = []
|
79 |
locs = []
|
89 |
for p in M.Project.query.find().skip(offset).limit(PROJECTS_PER_FILE):
|
80 |
file_count = 0
|
|
|
81 |
# write sitemap files, MAX_SITEMAP_URLS per file
|
|
|
82 |
for chunk in utils.chunked_find(M.Project):
|
|
|
83 |
for p in chunk:
|
90 |
c.project = p
|
84 |
c.project = p
|
91 |
try:
|
85 |
try:
|
92 |
locs += [BASE_URL + s.url for s in p.sitemap()]
|
86 |
locs += [BASE_URL + s.url if s.url[0] == '/' else s.url
|
|
|
87 |
for s in p.sitemap()]
|
93 |
except Exception, e:
|
88 |
except Exception, e:
|
94 |
print "Error creating sitemap for project '%s': %s" %\
|
89 |
print "Error creating sitemap for project '%s': %s" %\
|
95 |
(p.shortname, e)
|
90 |
(p.shortname, e)
|
96 |
creds.clear()
|
91 |
creds.clear()
|
97 |
sitemap_vars = dict(now=now, locs=locs)
|
92 |
if len(locs) >= options.urls_per_file:
|
98 |
sitemap_content = sitemap_content_template.render(sitemap_vars)
|
93 |
write_sitemap(locs[:options.urls_per_file], file_count)
|
99 |
with open(os.path.join(output_path, 'sitemap-%d.xml' % offset), 'w') as f:
|
94 |
del locs[:options.urls_per_file]
|
100 |
f.write(sitemap_content)
|
95 |
file_count += 1
|
101 |
session(p).clear()
|
96 |
session(p).clear()
|
102 |
ThreadLocalORMSession.close_all()
|
97 |
ThreadLocalORMSession.close_all()
|
|
|
98 |
while locs:
|
|
|
99 |
write_sitemap(locs[:options.urls_per_file], file_count)
|
|
|
100 |
del locs[:options.urls_per_file]
|
|
|
101 |
file_count += 1
|
|
|
102 |
# write sitemap index file
|
|
|
103 |
if file_count:
|
|
|
104 |
sitemap_index_vars = dict(
|
|
|
105 |
now=now,
|
|
|
106 |
sitemaps = [
|
|
|
107 |
'%s/allura_sitemap/sitemap-%d.xml' % (BASE_URL, n)
|
|
|
108 |
for n in range(file_count)])
|
|
|
109 |
sitemap_index_content = Template(INDEX_TEMPLATE).render(sitemap_index_vars)
|
|
|
110 |
with open(os.path.join(output_path, 'sitemap.xml'), 'w') as f:
|
|
|
111 |
f.write(sitemap_index_content)
|
103 |
|
112 |
|
104 |
def parse_options():
    """Parse the command-line options of the sitemap script.

    Options (read from ``sys.argv`` by ``optparse``):

    ``-o`` / ``--output-dir``
        Output directory (absolute path).
    ``-u`` / ``--urls-per-file``
        Number of URLs per sitemap file.  Must be a positive integer;
        values above ``MAX_SITEMAP_URLS`` are capped to that limit.

    Returns the ``(options, args)`` pair from ``OptionParser.parse_args()``.
    On invalid input optparse prints the usage message and exits.
    """
    from optparse import OptionParser, OptionValueError

    def validate(option, opt_str, value, parser):
        # A zero or negative chunk size would never drain the list of
        # pending URLs when it is sliced off in urls_per_file-sized pieces,
        # so refuse it up front instead of looping forever later.
        if value < 1:
            raise OptionValueError(
                '%s must be a positive integer, got %d' % (opt_str, value))
        # Silently cap oversized requests at the per-file maximum.
        parser.values.urls_per_file = min(value, MAX_SITEMAP_URLS)

    optparser = OptionParser(
        usage='allurapaste script /var/local/config/production.ini '
              '-- %prog [OPTIONS]')
    optparser.add_option('-o', '--output-dir', dest='output_dir',
                         default='/tmp/allura_sitemap',
                         help='Output directory (absolute path). '
                              '[default: %default]')
    # The default bypasses the callback; it is already within the limits.
    optparser.add_option('-u', '--urls-per-file', dest='urls_per_file',
                         default=10000, type='int',
                         help='Number of URLs per sitemap file. '
                              '[default: %default, max: ' +
                              str(MAX_SITEMAP_URLS) + ']',
                         action='callback', callback=validate)
    options, args = optparser.parse_args()
    return options, args
|
115 |
|
133 |
|
116 |
if __name__ == '__main__':
|
134 |
if __name__ == '__main__':
|
117 |
options, args = parse_options()
|
135 |
options, args = parse_options()
|