
a/scripts/trac_export.py b/scripts/trac_export.py
...
...
#       "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#       KIND, either express or implied.  See the License for the
#       specific language governing permissions and limitations
#       under the License.
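
# Export ticket data (fields, comments and attachment info) from a Trac
# instance as JSON.  A typical invocation might look like this (the URL is
# hypothetical):
#
#   python scripts/trac_export.py --limit 100 -o tickets.json http://trac.example.org/myproject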

import sys
import csv
import urlparse
import urllib2
import json
import time
import re
from optparse import OptionParser
from itertools import islice
from datetime import datetime

import feedparser
from html2text import html2text
from BeautifulSoup import BeautifulSoup, NavigableString
import dateutil.parser
import pytz


def parse_options():
    optparser = OptionParser(usage=''' %prog <Trac URL>

Export ticket data from a Trac instance''')
    optparser.add_option('-o', '--out-file', dest='out_filename', help='Write to file (default stdout)')
    optparser.add_option('--no-attachments', dest='do_attachments', action='store_false', default=True, help="Don't export attachment info")
    optparser.add_option('--only-tickets', dest='only_tickets', action='store_true', help='Export only ticket list')
    optparser.add_option('--start', dest='start_id', type='int', default=1, help='Start with given ticket number (or next accessible)')
    optparser.add_option('--limit', dest='limit', type='int', default=None, help='Limit number of tickets')
    optparser.add_option('-v', '--verbose', dest='verbose', action='store_true', help='Verbose operation')
    options, args = optparser.parse_args()
    if len(args) != 1:
        optparser.error("Wrong number of arguments.")
    return options, args


class TracExport(object):
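    """Iterator over the tickets of a remote Trac instance.

    Each item yielded is a dict of remapped ticket fields, with comments
    and (optionally) attachment metadata attached to it.
    """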

    PAGE_SIZE = 100
    TICKET_URL = 'ticket/%d'
    QUERY_MAX_ID_URL = 'query?col=id&order=id&desc=1&max=2'
    QUERY_BY_PAGE_URL = 'query?col=id&col=time&col=changetime&order=id&max=' + str(PAGE_SIZE) + '&page=%d'
    ATTACHMENT_LIST_URL = 'attachment/ticket/%d/'
    ATTACHMENT_URL = 'raw-attachment/ticket/%d/%s'

    FIELD_MAP = {
        'reporter': 'submitter',
        'owner': 'assigned_to',
    }

    def __init__(self, base_url, start_id=1):
        """start_id - start with at least that ticket number (the actual
                      returned ticket may have a higher id if we don't have
                      access to the exact one).
        """
        self.base_url = base_url.rstrip('/') + '/'
        # Contains additional info for a ticket which cannot
        # be fetched with a single-ticket export (create/mod
        # times are an example).
        self.ticket_map = {}
        self.start_id = start_id
        self.page = (start_id - 1) / self.PAGE_SIZE + 1
        self.ticket_queue = self.next_ticket_ids()

    def remap_fields(self, dict):
        "Remap fields to adhere to standard taxonomy."
        out = {}
        for k, v in dict.iteritems():
            out[self.FIELD_MAP.get(k, k)] = v

        out['id'] = int(out['id'])
        if 'private' in out:
            out['private'] = bool(int(out['private']))
        return out

    def full_url(self, suburl, type=None):
        url = urlparse.urljoin(self.base_url, suburl)
        if type is None:
            return url
        glue = '&' if '?' in suburl else '?'
        return url + glue + 'format=' + type

    @staticmethod
    def log_url(url):
        if options.verbose:
            print >>sys.stderr, url

    @classmethod
    def trac2z_date(cls, s):
        d = dateutil.parser.parse(s)
        d = d.astimezone(pytz.UTC)
        return d.strftime("%Y-%m-%dT%H:%M:%SZ")

    @staticmethod
    def match_pattern(regexp, string):
        m = re.match(regexp, string)
        assert m
        return m.group(1)

    def csvopen(self, url):
        self.log_url(url)
        f = urllib2.urlopen(url)
        # Trac doesn't throw a 403 error, it just shows a normal 200 HTML page
        # saying that access is denied. So, we'll emulate 403 ourselves.
        # TODO: currently, any non-csv result is treated as 403.
        if not f.info()['Content-Type'].startswith('text/csv'):
            raise urllib2.HTTPError(url, 403, 'Forbidden - emulated', f.info(), f)
        return f

    def parse_ticket_body(self, id):
        # Use CSV export to get ticket fields
        url = self.full_url(self.TICKET_URL % id, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        ticket_fields = reader.next()
        ticket_fields['class'] = 'ARTIFACT'
        return self.remap_fields(ticket_fields)

    def parse_ticket_comments(self, id):
        # Use RSS export to get ticket comments
        url = self.full_url(self.TICKET_URL % id, 'rss')
        self.log_url(url)
        d = feedparser.parse(url)
        res = []
        for comment in d['entries']:
            c = {}
            c['submitter'] = comment.author
            c['date'] = comment.updated_parsed
            c['comment'] = html2text(comment.summary)
            c['class'] = 'COMMENT'
            res.append(c)
        return res

    def parse_ticket_attachments(self, id):
        SIZE_PATTERN = r'(\d+) bytes'
        TIMESTAMP_PATTERN = r'(.+) in Timeline'
        # Scrape HTML to get ticket attachments
        url = self.full_url(self.ATTACHMENT_LIST_URL % id)
        self.log_url(url)
        f = urllib2.urlopen(url)
        soup = BeautifulSoup(f)
        attach = soup.find('div', id='attachments')
        list = []
        while attach:
            attach = attach.findNext('dt')
            if not attach:
                break
            d = {}
            d['filename'] = attach.a['href'].rsplit('/', 1)[1]
            d['url'] = self.full_url(self.ATTACHMENT_URL % (id, d['filename']))
            size_s = attach.span['title']
            d['size'] = int(self.match_pattern(SIZE_PATTERN, size_s))
            timestamp_s = attach.find('a', {'class': 'timeline'})['title']
            d['date'] = self.trac2z_date(self.match_pattern(TIMESTAMP_PATTERN, timestamp_s))
            d['by'] = attach.find(text=re.compile('added by')).nextSibling.renderContents()
            d['description'] = ''
            # Skip whitespace
            while attach.nextSibling and type(attach.nextSibling) is NavigableString:
                attach = attach.nextSibling
            # If there's a description, there will be a <dd> element; otherwise
            # the next element is the following <dt>
            if attach.nextSibling and attach.nextSibling.name == 'dd':
                desc_el = attach.nextSibling
                if desc_el:
                    # TODO: Convert to Allura link syntax as needed
                    d['description'] = ''.join(desc_el.findAll(text=True)).strip()
            list.append(d)
        return list

    def get_max_ticket_id(self):
        url = self.full_url(self.QUERY_MAX_ID_URL, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        fields = reader.next()
        print fields
        return int(fields['id'])

    def get_ticket(self, id, extra={}):
        '''Get ticket with given id
        extra: extra fields to add to ticket (parsed elsewhere)
        '''
        t = self.parse_ticket_body(id)
        t['comments'] = self.parse_ticket_comments(id)
        if options.do_attachments:
            atts = self.parse_ticket_attachments(id)
            if atts:
                t['attachments'] = atts
        t.update(extra)
        return t

    def next_ticket_ids(self):
        'Go through the ticket list and collect available ticket ids.'
        # We could just do a CSV export, which by default dumps the entire list.
        # Alas, for many busy servers with a long ticket list, it will just
        # time out. So, let's paginate it instead.
        res = []

        url = self.full_url(self.QUERY_BY_PAGE_URL % self.page, 'csv')
        try:
            f = self.csvopen(url)
        except urllib2.HTTPError, e:
            if 'emulated' in e.msg:
                body = e.fp.read()
                if 'beyond the number of pages in the query' in body or 'Log in with a SourceForge account' in body:
                    raise StopIteration
            raise
        reader = csv.reader(f)
        cols = reader.next()
        for r in reader:
            if r and r[0].isdigit():
                id = int(r[0])
                extra = {'date': self.trac2z_date(r[1]), 'date_updated': self.trac2z_date(r[2])}
                res.append((id, extra))
        self.page += 1

        return res

    def __iter__(self):
        return self

    def next(self):
        while True:
            # queue empty, try to fetch more
            if len(self.ticket_queue) == 0:
                self.ticket_queue = self.next_ticket_ids()
            # there aren't any more, we're really done
            if len(self.ticket_queue) == 0:
                raise StopIteration
            id, extra = self.ticket_queue.pop(0)
            if id >= self.start_id:
                break
        return self.get_ticket(id, extra)


class DateJSONEncoder(json.JSONEncoder):
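    """Serialize time.struct_time values (ticket comment dates taken from the
    RSS feed) in the same ISO-8601/Zulu format used elsewhere in the export."""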
    def default(self, obj):
        if isinstance(obj, time.struct_time):
            return time.strftime('%Y-%m-%dT%H:%M:%SZ', obj)
        return json.JSONEncoder.default(self, obj)

if __name__ == '__main__':
    options, args = parse_options()
    ex = TracExport(args[0], start_id=options.start_id)
    # Limit the length of the iterator sequence using islice()
    doc = [t for t in islice(ex, options.limit)]

    if not options.only_tickets:
        doc = {
            'class': 'PROJECT',
            'trackers': {'default': {'artifacts': doc}}
        }

    out_file = sys.stdout
    if options.out_filename:
        out_file = open(options.out_filename, 'w')
    out_file.write(json.dumps(doc, cls=DateJSONEncoder, indent=2, sort_keys=True))
    # It's a bad habit not to terminate lines
    out_file.write('\n')

The other side of the diff reduces scripts/trac_export.py to a thin wrapper that delegates to the packaged allura.scripts.trac_export module:

if __name__ == '__main__':
    from allura.scripts.trac_export import main
    main()
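
A quick way to sanity-check an export is to read the JSON back with nothing but the standard library; a minimal sketch (the output file name is hypothetical):

import json

with open('tickets.json') as f:
    doc = json.load(f)
# With the default options the top level is the PROJECT wrapper built above;
# with --only-tickets, doc would be the bare list of ticket dicts instead.
for t in doc['trackers']['default']['artifacts']:
    print t['id'], len(t['comments'])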