b/trac-import/TracExport.py
from pprint import pprint
import csv
import urllib2
import json
import time

import feedparser
from html2text import html2text


class TracExport(object):

    TICKET_URL = '/ticket/%d'
    QUERY_MAX_ID_URL = '/query?col=id&order=id&desc=1&max=2'
    QUERY_BY_PAGE_URL = '/query?col=id&order=id&max=100&page=%d'
    # Unpaginated query (assumed form); used only by get_ticket_ids_csv().
    QUERY_URL = '/query?col=id&order=id'

    # Map Trac field names to the standard taxonomy used downstream.
    FIELD_MAP = {
        'reporter': 'submitter',
        'owner': 'assigned_to',
    }

    def __init__(self, base_url):
        self.base_url = base_url

    def remap_fields(self, fields):
        "Remap fields to adhere to the standard taxonomy."
        out = {}
        for k, v in fields.iteritems():
            out[self.FIELD_MAP.get(k, k)] = v

        if 'private' in out:
            # Trac exports booleans as '0'/'1' strings
            out['private'] = bool(int(out['private']))
        return out

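    # Illustrative only - remap_fields() on a hypothetical Trac row
    # (field values made up):
    #
    #   remap_fields({'reporter': 'alice', 'owner': 'bob', 'private': '1'})
    #   => {'submitter': 'alice', 'assigned_to': 'bob', 'private': True}
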
    def full_url(self, suburl, fmt):
        glue = '&' if '?' in suburl else '?'
        return self.base_url + suburl + glue + 'format=' + fmt

    def csvopen(self, url):
        print url
        f = urllib2.urlopen(url)
        # Trac doesn't throw a 403 error on denied access - it just shows a
        # normal 200 HTML page saying that access is denied. So, we'll
        # emulate the 403 ourselves.
        # TODO: currently, any non-CSV result is treated as a 403.
        if not f.info()['Content-Type'].startswith('text/csv'):
            raise urllib2.HTTPError(url, 403, 'Forbidden - emulated', f.info(), f)
        return f

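    # Illustrative only - how full_url() picks the glue character, assuming
    # a hypothetical base_url of 'http://example.com':
    #
    #   full_url('/ticket/9', 'csv')    => 'http://example.com/ticket/9?format=csv'
    #   full_url('/query?max=2', 'csv') => 'http://example.com/query?max=2&format=csv'
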
    def parse_ticket_body(self, id):
        # Use CSV export to get ticket fields
        url = self.full_url(self.TICKET_URL % id, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        ticket_fields = reader.next()
        ticket_fields['class'] = 'ARTIFACT'
        return self.remap_fields(ticket_fields)

    def parse_ticket_comments(self, id):
        # Use RSS export to get ticket comments
        d = feedparser.parse(self.full_url(self.TICKET_URL % id, 'rss'))
        # pprint(d['entries'])
        res = []
        for comment in d['entries']:
            c = {}
            c['submitter'] = comment.author
            c['date'] = comment.updated_parsed
            c['comment'] = html2text(comment.summary)
            c['class'] = 'COMMENT'
            res.append(c)
        return res

    def get_ticket(self, id):
        t = self.parse_ticket_body(id)
        t['comments'] = self.parse_ticket_comments(id)
        return t

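    # Illustrative only - rough shape of the dict get_ticket() returns.
    # Exact keys depend on the Trac instance's CSV columns; values here
    # are made up:
    #
    #   {'class': 'ARTIFACT', 'id': '9', 'summary': '...',
    #    'submitter': 'alice', 'assigned_to': 'bob',
    #    'comments': [{'class': 'COMMENT', 'submitter': 'bob',
    #                  'date': <time.struct_time>, 'comment': '...'}]}
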
    def get_ticket_ids_csv(self):
        # Unpaginated export of all ticket ids; may time out on busy
        # servers with a long ticket list (see enumerate_ticket_ids).
        url = self.full_url(self.QUERY_URL, 'csv')
        print url
        f = urllib2.urlopen(url, timeout=None)
        reader = csv.reader(f)
        cols = reader.next()  # skip the header row
        ids = [int(r[0]) for r in reader]
        return ids

    def get_ticket_ids(self):
        # As Trac has only one tracker and numbers tickets sequentially,
        # we have two choices here:
        # 1. Get the last existing ticket id and just produce the sequence
        #    from 1 to that id. But then we should be ready for the fact
        #    that some tickets in this range will be unavailable.
        # 2. Export the artifact list and get the ids which are really
        #    accessible to the current user. This requires paginating the
        #    list, which takes some time and extra traffic.
        # Method 2 is what's used below; flip the flag to switch to method 1.
        if False:
            max_id = self.get_max_ticket_id()
            return xrange(1, max_id + 1)
        else:
            return self.enumerate_ticket_ids()

    def get_max_ticket_id(self):
        # Tickets are ordered by descending id, so the first row is the max.
        url = self.full_url(self.QUERY_MAX_ID_URL, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        fields = reader.next()
        print fields
        return int(fields['id'])

    def enumerate_ticket_ids(self, page=1):
        'Go through the ticket list and collect available ticket ids.'
        # We could just do a CSV export, which by default dumps the entire
        # list. Alas, for many busy servers with a long ticket list, it
        # will just time out. So, let's paginate it instead.
        res = []
        while True:
            url = self.full_url(self.QUERY_BY_PAGE_URL % page, 'csv')
            try:
                f = self.csvopen(url)
            except urllib2.HTTPError, e:
                # Trac reports "page beyond the number of pages" as an HTML
                # error page, which csvopen() surfaces as an emulated 403.
                if 'emulated' in e.msg:
                    body = e.fp.read()
                    if 'beyond the number of pages in the query' in body:
                        break
                raise
            reader = csv.reader(f)
            cols = reader.next()  # skip the header row
            ids = [int(r[0]) for r in reader if r and r[0][0].isdigit()]
            res += ids
            page += 1

        return res

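    # Illustrative only - the paginated URLs this fetches, assuming a
    # hypothetical base_url of 'http://example.com':
    #
    #   http://example.com/query?col=id&order=id&max=100&page=1&format=csv
    #   http://example.com/query?col=id&order=id&max=100&page=2&format=csv
    #   ... and so on, until Trac answers with the "beyond the number of
    #   pages in the query" error page.
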
class DateJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        # feedparser returns dates as time.struct_time (normalized to UTC),
        # so render them as ISO 8601 with a 'Z' suffix.
        if isinstance(obj, time.struct_time):
            return time.strftime('%Y-%m-%dT%H:%M:%SZ', obj)
        return json.JSONEncoder.default(self, obj)

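# Illustrative only - what DateJSONEncoder does with a struct_time
# (input value made up):
#
#   json.dumps({'date': time.gmtime(0)}, cls=DateJSONEncoder)
#   => '{"date": "1970-01-01T00:00:00Z"}'
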
if __name__ == '__main__':
    TRAC_BASE_URL = 'http://sourceforge.net/apps/trac/sourceforge'
    ex = TracExport(TRAC_BASE_URL)
    # d = ex.parse_ticket_body(9)
    # pprint(d)
    # d = ex.parse_ticket_comments(9)
    # pprint(d)
    # d = ex.get_ticket(9)
    # pprint(d)
    # d = ex.get_max_ticket_id()
    # d = ex.get_ticket_ids()
    ids = [3]
    doc = [ex.get_ticket(i) for i in ids]
    print json.dumps(doc, cls=DateJSONEncoder, indent=2)