
a/trac-import/TracExport.py b/trac-import/TracExport.py
from pprint import pprint
import csv
import urllib2
from cStringIO import StringIO
import json
import time

import feedparser
from html2text import html2text

from allura.lib import rest_api


class TracExport(object):

    TICKET_URL = '/ticket/%d'
    QUERY_MAX_ID_URL  = '/query?col=id&order=id&desc=1&max=2'
    QUERY_BY_PAGE_URL = '/query?col=id&order=id&max=100&page=%d'
    # Unpaginated query (assumed form); used by get_ticket_ids_csv() below.
    QUERY_URL = '/query?col=id&order=id'

    FIELD_MAP = {
        'reporter': 'submitter',
        'owner': 'assigned_to',
    }

    def __init__(self, base_url):
        self.base_url = base_url

    def remap_fields(self, fields):
        "Remap fields to adhere to standard taxonomy."
        out = {}
        for k, v in fields.iteritems():
            out[self.FIELD_MAP.get(k, k)] = v

        if 'private' in out:
            out['private'] = bool(int(out['private']))
        return out

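    # Trac's export views take a format= query parameter (e.g. csv, rss);
    # full_url() appends it to a sub-URL with the right '?'/'&' separator.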
    def full_url(self, suburl, type):
        glue = '&' if '?' in suburl else '?'
        return self.base_url + suburl + glue + 'format=' + type

    def csvopen(self, url):
        print url
        f = urllib2.urlopen(url)
        # Trac doesn't return a 403 error, it just shows a normal 200 HTML
        # page saying that access is denied. So, we'll emulate 403 ourselves.
        # TODO: currently, any non-CSV result is treated as 403.
        if not f.info()['Content-Type'].startswith('text/csv'):
            raise urllib2.HTTPError(url, 403, 'Forbidden - emulated', f.info(), f)
        return f

    def parse_ticket_body(self, id):
        # Use CSV export to get ticket fields
        url = self.full_url(self.TICKET_URL % id, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        ticket_fields = reader.next()
        ticket_fields['class'] = 'ARTIFACT'
        return self.remap_fields(ticket_fields)

    def parse_ticket_comments(self, id):
        # Use RSS export to get ticket comments
        d = feedparser.parse(self.full_url(self.TICKET_URL % id, 'rss'))
    #    pprint.pprint(d['entries'])
        res = []
        for comment in d['entries']:
            c = {}
            c['submitter'] = comment.author
            c['date'] = comment.updated_parsed
            c['comment'] = html2text(comment.summary)
            c['class'] = 'COMMENT'
            res.append(c)
        return res

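    # A full ticket document is the CSV-exported field dict with the
    # RSS-exported comment list merged in under the 'comments' key.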
    def get_ticket(self, id):
        t = self.parse_ticket_body(id)
        t['comments'] = self.parse_ticket_comments(id)
        return t

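    # Unpaginated ticket-list export; on busy servers with long ticket
    # lists this can simply time out (see enumerate_ticket_ids below).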
    def get_ticket_ids_csv(self):
        url = self.full_url(self.QUERY_URL, 'csv')
        print url
        f = urllib2.urlopen(url, timeout=None)
        reader = csv.reader(f)
        cols = reader.next()
        ids = [r for r in reader]
        return ids

    def get_ticket_ids(self):
        # As Trac has only one tracker and numbers tickets sequentially,
        # we have two choices here:
        # 1. Get the last existing ticket id and just make a sequence
        #    from 1 to that id. But then we should be ready for the fact
        #    that some tickets in this range will be unavailable.
        # 2. Export the artifact list and get the ids which are really
        #    accessible to the current user.
        # It turns out that we'd need to paginate the artifact list, which
        # costs some time and extra traffic, so the first method would be
        # the default; the toggle below currently selects the second.
        if False:
            max_id = self.get_max_ticket_id()
            return xrange(1, max_id + 1)
        else:
            return self.enumerate_ticket_ids()

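    # QUERY_MAX_ID_URL asks for ids in descending order, so the first row
    # of the CSV result carries the highest ticket id.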
    def get_max_ticket_id(self):
        url = self.full_url(self.QUERY_MAX_ID_URL, 'csv')
        f = self.csvopen(url)
        reader = csv.DictReader(f)
        fields = reader.next()
        print fields
        return int(fields['id'])

    def enumerate_ticket_ids(self, page=1):
        'Go through the ticket list and collect available ticket ids.'
        # We could just do a CSV export, which by default dumps the entire
        # list. Alas, for many busy servers with long ticket lists, that
        # will just time out. So, let's paginate it instead, requesting
        # pages until Trac reports that we are beyond the number of pages
        # in the query (csvopen() surfaces that as an emulated 403).
        res = []
        while True:
            url = self.full_url(self.QUERY_BY_PAGE_URL % page, 'csv')
            try:
                f = self.csvopen(url)
            except urllib2.HTTPError, e:
                if 'emulated' in e.msg:
                    body = e.fp.read()
                    if 'beyond the number of pages in the query' in body:
                        break
                raise
            reader = csv.reader(f)
            cols = reader.next()
            ids = [int(r[0]) for r in reader if r and r[0][0].isdigit()]
            res += ids
            page += 1

        return res
        
# feedparser parses comment dates into time.struct_time values, which the
# stock JSONEncoder can't serialize; render them as ISO-8601-style UTC
# timestamps instead.
class DateJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, time.struct_time):
            return time.strftime('%Y-%m-%dT%H:%M:%SZ', obj)
        return json.JSONEncoder.default(self, obj)

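# Manual smoke test: export a sample ticket from the SourceForge-hosted Trac
# instance and dump the resulting document as JSON.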
if __name__ == '__main__':
    TRAC_BASE_URL = 'http://sourceforge.net/apps/trac/sourceforge'
    ex = TracExport(TRAC_BASE_URL)
#    d = ex.parse_ticket_body(9)
#    pprint(d)
#    d = ex.parse_ticket_comments(9)
#    pprint(d)
#    d = ex.get_ticket(9)
#    pprint(d)
#    d = ex.get_max_ticket_id()
#    d = ex.get_ticket_ids()
    ids = [3]
    doc = [ex.get_ticket(i) for i in ids]
    print json.dumps(doc, cls=DateJSONEncoder, indent=2)