--- a/scripts/trac_export.py
+++ b/scripts/trac_export.py
@@ -15,261 +15,8 @@
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 # KIND, either express or implied. See the License for the
 # specific language governing permissions and limitations
 # under the License.
 
-
-import sys
-import csv
-import urlparse
-import urllib2
-import json
-import time
-import re
-from optparse import OptionParser
-from itertools import islice
-from datetime import datetime
-
-import feedparser
-from html2text import html2text
-from BeautifulSoup import BeautifulSoup, NavigableString
-import dateutil.parser
-import pytz
-
-
-def parse_options():
-    optparser = OptionParser(usage=''' %prog <Trac URL>
-
-Export ticket data from a Trac instance''')
-    optparser.add_option('-o', '--out-file', dest='out_filename', help='Write to file (default stdout)')
-    optparser.add_option('--no-attachments', dest='do_attachments', action='store_false', default=True, help='Export attachment info')
-    optparser.add_option('--only-tickets', dest='only_tickets', action='store_true', help='Export only ticket list')
-    optparser.add_option('--start', dest='start_id', type='int', default=1, help='Start with given ticket numer (or next accessible)')
-    optparser.add_option('--limit', dest='limit', type='int', default=None, help='Limit number of tickets')
-    optparser.add_option('-v', '--verbose', dest='verbose', action='store_true', help='Verbose operation')
-    options, args = optparser.parse_args()
-    if len(args) != 1:
-        optparser.error("Wrong number of arguments.")
-    return options, args
-
-
-class TracExport(object):
-
-    PAGE_SIZE = 100
-    TICKET_URL = 'ticket/%d'
-    QUERY_MAX_ID_URL = 'query?col=id&order=id&desc=1&max=2'
-    QUERY_BY_PAGE_URL = 'query?col=id&col=time&col=changetime&order=id&max=' + str(PAGE_SIZE)+ '&page=%d'
-    ATTACHMENT_LIST_URL = 'attachment/ticket/%d/'
-    ATTACHMENT_URL = 'raw-attachment/ticket/%d/%s'
-
-    FIELD_MAP = {
-        'reporter': 'submitter',
-        'owner': 'assigned_to',
-    }
-
-    def __init__(self, base_url, start_id=1):
-        """start_id - start with at least that ticket number (actual returned
-        ticket may have higher id if we don't have access to exact
-        one).
-        """
-        self.base_url = base_url.rstrip('/') + '/'
-        # Contains additional info for a ticket which cannot
-        # be get with single-ticket export (create/mod times is
-        # and example).
-        self.ticket_map = {}
-        self.start_id = start_id
-        self.page = (start_id - 1) / self.PAGE_SIZE + 1
-        self.ticket_queue = self.next_ticket_ids()
-
-    def remap_fields(self, dict):
-        "Remap fields to adhere to standard taxonomy."
-        out = {}
-        for k, v in dict.iteritems():
-            out[self.FIELD_MAP.get(k, k)] = v
-
-        out['id'] = int(out['id'])
-        if 'private' in out:
-            out['private'] = bool(int(out['private']))
-        return out
-
-    def full_url(self, suburl, type=None):
-        url = urlparse.urljoin(self.base_url, suburl)
-        if type is None:
-            return url
-        glue = '&' if '?' in suburl else '?'
-        return url + glue + 'format=' + type
-
-    @staticmethod
-    def log_url(url):
-        if options.verbose:
-            print >>sys.stderr, url
-
-    @classmethod
-    def trac2z_date(cls, s):
-        d = dateutil.parser.parse(s)
-        d = d.astimezone(pytz.UTC)
-        return d.strftime("%Y-%m-%dT%H:%M:%SZ")
-
-    @staticmethod
-    def match_pattern(regexp, string):
-        m = re.match(regexp, string)
-        assert m
-        return m.group(1)
-
-    def csvopen(self, url):
-        self.log_url(url)
-        f = urllib2.urlopen(url)
-        # Trac doesn't throw 403 error, just shows normal 200 HTML page
-        # telling that access denied. So, we'll emulate 403 ourselves.
-        # TODO: currently, any non-csv result treated as 403.
-        if not f.info()['Content-Type'].startswith('text/csv'):
-            raise urllib2.HTTPError(url, 403, 'Forbidden - emulated', f.info(), f)
-        return f
-
-    def parse_ticket_body(self, id):
-        # Use CSV export to get ticket fields
-        url = self.full_url(self.TICKET_URL % id, 'csv')
-        f = self.csvopen(url)
-        reader = csv.DictReader(f)
-        ticket_fields = reader.next()
-        ticket_fields['class'] = 'ARTIFACT'
-        return self.remap_fields(ticket_fields)
-
-    def parse_ticket_comments(self, id):
-        # Use RSS export to get ticket comments
-        url = self.full_url(self.TICKET_URL % id, 'rss')
-        self.log_url(url)
-        d = feedparser.parse(url)
-        res = []
-        for comment in d['entries']:
-            c = {}
-            c['submitter'] = comment.author
-            c['date'] = comment.updated_parsed
-            c['comment'] = html2text(comment.summary)
-            c['class'] = 'COMMENT'
-            res.append(c)
-        return res
-
-    def parse_ticket_attachments(self, id):
-        SIZE_PATTERN = r'(\d+) bytes'
-        TIMESTAMP_PATTERN = r'(.+) in Timeline'
-        # Scrape HTML to get ticket attachments
-        url = self.full_url(self.ATTACHMENT_LIST_URL % id)
-        self.log_url(url)
-        f = urllib2.urlopen(url)
-        soup = BeautifulSoup(f)
-        attach = soup.find('div', id='attachments')
-        list = []
-        while attach:
-            attach = attach.findNext('dt')
-            if not attach:
-                break
-            d = {}
-            d['filename'] = attach.a['href'].rsplit('/', 1)[1]
-            d['url'] = self.full_url(self.ATTACHMENT_URL % (id, d['filename']))
-            size_s = attach.span['title']
-            d['size'] = int(self.match_pattern(SIZE_PATTERN, size_s))
-            timestamp_s = attach.find('a', {'class': 'timeline'})['title']
-            d['date'] = self.trac2z_date(self.match_pattern(TIMESTAMP_PATTERN, timestamp_s))
-            d['by'] = attach.find(text=re.compile('added by')).nextSibling.renderContents()
-            d['description'] = ''
-            # Skip whitespace
-            while attach.nextSibling and type(attach.nextSibling) is NavigableString:
-                attach = attach.nextSibling
-            # if there's a description, there will be a <dd> element, other immediately next <dt>
-            if attach.nextSibling and attach.nextSibling.name == 'dd':
-                desc_el = attach.nextSibling
-                if desc_el:
-                    # TODO: Convert to Allura link syntax as needed
-                    d['description'] = ''.join(desc_el.findAll(text=True)).strip()
-            list.append(d)
-        return list
-
-    def get_max_ticket_id(self):
-        url = self.full_url(self.QUERY_MAX_ID_URL, 'csv')
-        f = self.csvopen(url)
-        reader = csv.DictReader(f)
-        fields = reader.next()
-        print fields
-        return int(fields['id'])
-
-    def get_ticket(self, id, extra={}):
-        '''Get ticket with given id
-        extra: extra fields to add to ticket (parsed elsewhere)
-        '''
-        t = self.parse_ticket_body(id)
-        t['comments'] = self.parse_ticket_comments(id)
-        if options.do_attachments:
-            atts = self.parse_ticket_attachments(id)
-            if atts:
-                t['attachments'] = atts
-        t.update(extra)
-        return t
-
-    def next_ticket_ids(self):
-        'Go thru ticket list and collect available ticket ids.'
-        # We could just do CSV export, which by default dumps entire list
-        # Alas, for many busy servers with long ticket list, it will just
-        # time out. So, let's paginate it instead.
-        res = []
-
-        url = self.full_url(self.QUERY_BY_PAGE_URL % self.page, 'csv')
-        try:
-            f = self.csvopen(url)
-        except urllib2.HTTPError, e:
-            if 'emulated' in e.msg:
-                body = e.fp.read()
-                if 'beyond the number of pages in the query' in body or 'Log in with a SourceForge account' in body:
-                    raise StopIteration
-            raise
-        reader = csv.reader(f)
-        cols = reader.next()
-        for r in reader:
-            if r and r[0].isdigit():
-                id = int(r[0])
-                extra = {'date': self.trac2z_date(r[1]), 'date_updated': self.trac2z_date(r[2])}
-                res.append((id, extra))
-        self.page += 1
-
-        return res
-
-    def __iter__(self):
-        return self
-
-    def next(self):
-        while True:
-            # queue empty, try to fetch more
-            if len(self.ticket_queue) == 0:
-                self.ticket_queue = self.next_ticket_ids()
-                # there aren't any more, we're really done
-                if len(self.ticket_queue) == 0:
-                    raise StopIteration
-            id, extra = self.ticket_queue.pop(0)
-            if id >= self.start_id:
-                break
-        return self.get_ticket(id, extra)
-
-
-class DateJSONEncoder(json.JSONEncoder):
-    def default(self, obj):
-        if isinstance(obj, time.struct_time):
-            return time.strftime('%Y-%m-%dT%H:%M:%SZ', obj)
-        return json.JSONEncoder.default(self, obj)
-
if __name__ == '__main__':
|
20 |
if __name__ == '__main__':
|
259 |
options, args = parse_options()
|
21 |
from allura.scripts.trac_export import main
|
260 |
ex = TracExport(args[0], start_id=options.start_id)
|
22 |
main()
|
261 |
# Implement iterator sequence limiting using islice()
|
|
|
262 |
doc = [t for t in islice(ex, options.limit)]
|
|
|
263 |
|
|
|
264 |
if not options.only_tickets:
|
|
|
265 |
doc = {
|
|
|
266 |
'class': 'PROJECT',
|
|
|
267 |
'trackers': {'default': {'artifacts': doc}}
|
|
|
268 |
}
|
|
|
269 |
|
|
|
270 |
out_file = sys.stdout
|
|
|
271 |
if options.out_filename:
|
|
|
272 |
out_file = open(options.out_filename, 'w')
|
|
|
273 |
out_file.write(json.dumps(doc, cls=DateJSONEncoder, indent=2, sort_keys=True))
|
|
|
274 |
# It's bad habit not to terminate lines
|
|
|
275 |
out_file.write('\n')
|
|
|