
a/scripts/trac_export.py b/scripts/trac_export.py
...
...
#       "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
#       KIND, either express or implied.  See the License for the
#       specific language governing permissions and limitations
#       under the License.
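
# Export ticket data (fields, comments and attachment info) from a Trac
# instance as JSON.  A typical invocation might look like this (the URL is
# hypothetical):
#
#   python scripts/trac_export.py --limit 100 -o tickets.json http://trac.example.org/myproject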

import sys
import csv
import urlparse
import urllib2
import json
import time
import re
from optparse import OptionParser
from itertools import islice
from datetime import datetime

import feedparser
from html2text import html2text
from BeautifulSoup import BeautifulSoup, NavigableString
import dateutil.parser
import pytz


def parse_options():
    optparser = OptionParser(usage=''' %prog <Trac URL>

Export ticket data from a Trac instance''')
    optparser.add_option('-o', '--out-file', dest='out_filename', help='Write to file (default stdout)')
    optparser.add_option('--no-attachments', dest='do_attachments', action='store_false', default=True, help="Don't export attachment info")
    optparser.add_option('--only-tickets', dest='only_tickets', action='store_true', help='Export only ticket list')
    optparser.add_option('--start', dest='start_id', type='int', default=1, help='Start with given ticket number (or next accessible)')
    optparser.add_option('--limit', dest='limit', type='int', default=None, help='Limit number of tickets')
    optparser.add_option('-v', '--verbose', dest='verbose', action='store_true', help='Verbose operation')
    options, args = optparser.parse_args()
    if len(args) != 1:
        optparser.error("Wrong number of arguments.")
    return options, args


class TracExport(object):
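    """Iterator over the tickets of a remote Trac instance.

    Each item yielded is a dict of remapped ticket fields, with comments
    and (optionally) attachment metadata attached to it.
    """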

    PAGE_SIZE = 100
    TICKET_URL = 'ticket/%d'
    QUERY_MAX_ID_URL = 'query?col=id&order=id&desc=1&max=2'
    QUERY_BY_PAGE_URL = 'query?col=id&col=time&col=changetime&order=id&max=' + str(PAGE_SIZE) + '&page=%d'
    ATTACHMENT_LIST_URL = 'attachment/ticket/%d/'
    ATTACHMENT_URL = 'raw-attachment/ticket/%d/%s'

    FIELD_MAP = {
        'reporter': 'submitter',
        'owner': 'assigned_to',
    }

    def __init__(self, base_url, start_id=1):
        """start_id - start with at least that ticket number (the actual
                      returned ticket may have a higher id if we don't have
                      access to the exact one).
        """
        self.base_url = base_url.rstrip('/') + '/'
        # Contains additional info for a ticket which cannot
        # be fetched with a single-ticket export (create/mod
        # times are an example).
        self.ticket_map = {}
        self.start_id = start_id
        self.page = (start_id - 1) / self.PAGE_SIZE + 1
        self.ticket_queue = self.next_ticket_ids()

    def remap_fields(self, dict):
        "Remap fields to adhere to standard taxonomy."
        out = {}
        for k, v in dict.iteritems():
            out[self.FIELD_MAP.get(k, k)] = v

        out['id'] = int(out['id'])
        if 'private' in out:
            out['private'] = bool(int(out['private']))
        return out

    def full_url(self, suburl, type=None):
        url = urlparse.urljoin(self.base_url, suburl)
        if type is None:
            return url
        glue = '&' if '?' in suburl else '?'
        return url + glue + 'format=' + type

    @staticmethod
    def log_url(url):
        if options.verbose:
            print >>sys.stderr, url

    @classmethod
    def trac2z_date(cls, s):
        d = dateutil.parser.parse(s)
        d = d.astimezone(pytz.UTC)
        return d.strftime("%Y-%m-%dT%H:%M:%SZ")

    @staticmethod
    def match_pattern(regexp, string):
        m = re.match(regexp, string)
        assert m
        return m.group(1)

    def csvopen(self, url):
        self.log_url(url)
        f = urllib2.urlopen(url)
        # Trac doesn't throw a 403 error, it just shows a normal 200 HTML page
        # saying that access is denied. So, we'll emulate 403 ourselves.
        # TODO: currently, any non-csv result is treated as 403.
        if not f.info()['Content-Type'].startswith('text/csv'):
            raise urllib2.HTTPError(url, 403, 'Forbidden - emulated', f.info(), f)
        return f

    def parse_ticket_body(self, id):
        # Use CSV export to get ticket fields
        url = self.full_url(self.TICKET_URL % id, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        ticket_fields = reader.next()
        ticket_fields['class'] = 'ARTIFACT'
        return self.remap_fields(ticket_fields)

    def parse_ticket_comments(self, id):
        # Use RSS export to get ticket comments
        url = self.full_url(self.TICKET_URL % id, 'rss')
        self.log_url(url)
        d = feedparser.parse(url)
        res = []
        for comment in d['entries']:
            c = {}
            c['submitter'] = comment.author
            c['date'] = comment.updated_parsed
            c['comment'] = html2text(comment.summary)
            c['class'] = 'COMMENT'
            res.append(c)
        return res

    def parse_ticket_attachments(self, id):
        SIZE_PATTERN = r'(\d+) bytes'
        TIMESTAMP_PATTERN = r'(.+) in Timeline'
        # Scrape HTML to get ticket attachments
        url = self.full_url(self.ATTACHMENT_LIST_URL % id)
        self.log_url(url)
        f = urllib2.urlopen(url)
        soup = BeautifulSoup(f)
        attach = soup.find('div', id='attachments')
        list = []
        while attach:
            attach = attach.findNext('dt')
            if not attach:
                break
            d = {}
            d['filename'] = attach.a['href'].rsplit('/', 1)[1]
            d['url'] = self.full_url(self.ATTACHMENT_URL % (id, d['filename']))
            size_s = attach.span['title']
            d['size'] = int(self.match_pattern(SIZE_PATTERN, size_s))
            timestamp_s = attach.find('a', {'class': 'timeline'})['title']
            d['date'] = self.trac2z_date(self.match_pattern(TIMESTAMP_PATTERN, timestamp_s))
            d['by'] = attach.find(text=re.compile('added by')).nextSibling.renderContents()
            d['description'] = ''
            # Skip whitespace
            while attach.nextSibling and type(attach.nextSibling) is NavigableString:
                attach = attach.nextSibling
            # If there's a description, there will be a <dd> element; otherwise
            # the next element is the following <dt>
            if attach.nextSibling and attach.nextSibling.name == 'dd':
                desc_el = attach.nextSibling
                if desc_el:
                    # TODO: Convert to Allura link syntax as needed
                    d['description'] = ''.join(desc_el.findAll(text=True)).strip()
            list.append(d)
        return list

    def get_max_ticket_id(self):
        url = self.full_url(self.QUERY_MAX_ID_URL, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        fields = reader.next()
        print fields
        return int(fields['id'])

    def get_ticket(self, id, extra={}):
        '''Get ticket with given id
        extra: extra fields to add to ticket (parsed elsewhere)
        '''
        t = self.parse_ticket_body(id)
        t['comments'] = self.parse_ticket_comments(id)
        if options.do_attachments:
            atts = self.parse_ticket_attachments(id)
            if atts:
                t['attachments'] = atts
        t.update(extra)
        return t

    def next_ticket_ids(self):
        'Go through the ticket list and collect available ticket ids.'
        # We could just do a CSV export, which by default dumps the entire list.
        # Alas, for many busy servers with a long ticket list, it will just
        # time out. So, let's paginate it instead.
        res = []

        url = self.full_url(self.QUERY_BY_PAGE_URL % self.page, 'csv')
        try:
            f = self.csvopen(url)
        except urllib2.HTTPError, e:
            if 'emulated' in e.msg:
                body = e.fp.read()
                if 'beyond the number of pages in the query' in body or 'Log in with a SourceForge account' in body:
                    raise StopIteration
            raise
        reader = csv.reader(f)
        cols = reader.next()
        for r in reader:
            if r and r[0].isdigit():
                id = int(r[0])
                extra = {'date': self.trac2z_date(r[1]), 'date_updated': self.trac2z_date(r[2])}
                res.append((id, extra))
        self.page += 1

        return res

    def __iter__(self):
        return self

    def next(self):
        while True:
            # queue empty, try to fetch more
            if len(self.ticket_queue) == 0:
                self.ticket_queue = self.next_ticket_ids()
            # there aren't any more, we're really done
            if len(self.ticket_queue) == 0:
                raise StopIteration
            id, extra = self.ticket_queue.pop(0)
            if id >= self.start_id:
                break
        return self.get_ticket(id, extra)


class DateJSONEncoder(json.JSONEncoder):
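    """Serialize time.struct_time values (ticket comment dates taken from the
    RSS feed) in the same ISO-8601/Zulu format used elsewhere in the export."""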
    def default(self, obj):
        if isinstance(obj, time.struct_time):
            return time.strftime('%Y-%m-%dT%H:%M:%SZ', obj)
        return json.JSONEncoder.default(self, obj)

if __name__ == '__main__':
    options, args = parse_options()
    ex = TracExport(args[0], start_id=options.start_id)
    # Limit the length of the iterator sequence using islice()
    doc = [t for t in islice(ex, options.limit)]

    if not options.only_tickets:
        doc = {
            'class': 'PROJECT',
            'trackers': {'default': {'artifacts': doc}}
        }

    out_file = sys.stdout
    if options.out_filename:
        out_file = open(options.out_filename, 'w')
    out_file.write(json.dumps(doc, cls=DateJSONEncoder, indent=2, sort_keys=True))
    # It's a bad habit not to terminate lines
    out_file.write('\n')

The other side of the diff reduces scripts/trac_export.py to a thin wrapper that delegates to the packaged allura.scripts.trac_export module:

if __name__ == '__main__':
    from allura.scripts.trac_export import main
    main()
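
A quick way to sanity-check an export is to read the JSON back with nothing but the standard library; a minimal sketch (the output file name is hypothetical):

import json

with open('tickets.json') as f:
    doc = json.load(f)
# With the default options the top level is the PROJECT wrapper built above;
# with --only-tickets, doc would be the bare list of ticket dicts instead.
for t in doc['trackers']['default']['artifacts']:
    print t['id'], len(t['comments'])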