extractors.py

import os
import shutil
import json
import hashlib

from allura.command import base as allura_base


class MediawikiExtractor(object):
    """Base class for MediaWiki data provider"""

    def __init__(self, options):
        self.options = options
        if os.path.exists(self.options.dump_dir):
            # clear dump_dir before extraction (there may be old data)
            shutil.rmtree(self.options.dump_dir)
        os.makedirs(self.options.dump_dir)

    def extract(self):
        """Extract pages with history, attachments, talk-pages, etc"""
        raise NotImplementedError("subclass must override this")


class MySQLExtractor(MediawikiExtractor):
    """Extract MediaWiki data to json.

    Use connection to MySQL database as a data source.
    """

    def __init__(self, options):
        super(MySQLExtractor, self).__init__(options)
        self._connection = None
        self.db_options = {
            'host': self.options.host or 'localhost',
            'user': self.options.user,
            'passwd': self.options.password,
            'db': self.options.db_name,
            'port': self.options.port or 3306
        }

    def connection(self):
        try:
            import MySQLdb
        except ImportError:
            raise ImportError('GPL library MySQL-python is required for this operation')
        if not self._connection:
            try:
                self._connection = MySQLdb.connect(**self.db_options)
            except MySQLdb.DatabaseError, e:
                allura_base.log.error("Can't connect to database: %s" % str(e))
                exit(2)
        return self._connection

    def _save(self, content, *paths):
        """Save json to file in local filesystem"""
        out_file = os.path.join(self.options.dump_dir, *paths)
        if not os.path.exists(os.path.dirname(out_file)):
            os.makedirs(os.path.dirname(out_file))
        with open(out_file, 'w') as out:
            out.write(content.encode('utf-8'))

    def _save_attachment(self, filepath, *paths):
        """Save attachment in dump directory.

        Copy from mediawiki dump directory to our internal dump directory.

        args:
        filepath - path to attachment in mediawiki dump.
        *paths - path to internal dump directory.
        """
        out_dir = os.path.join(self.options.dump_dir, *paths)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        shutil.copy(filepath, out_dir)

    def _pages(self):
        """Yield page_data for next wiki page"""
        c = self.connection().cursor()
        # page_namespace == 0 - main (article) namespace
        c.execute('select page.page_id, page.page_title '
                  'from page where page.page_namespace = 0')
        for row in c:
            _id, title = row
            page_data = {
                'page_id': _id,
                'title': title,
            }
            yield page_data

    def _history(self, page_id):
        """Yield page_data for next revision of wiki page"""
        c = self.connection().cursor()
        c.execute('select revision.rev_timestamp, text.old_text '
                  'from revision '
                  'left join text on revision.rev_text_id = text.old_id '
                  'where revision.rev_page = %s', page_id)
        for row in c:
            timestamp, text = row
            page_data = {
                'timestamp': timestamp,
                'text': text or ''
            }
            yield page_data

    def _talk(self, page_title):
        """Return page_data for talk page with `page_title` title"""
        c = self.connection().cursor()
        query_attrs = (page_title, 1)  # page_namespace == 1 - talk pages
        c.execute('select text.old_text '
                  'from page '
                  'left join revision on revision.rev_id = page.page_latest '
                  'left join text on text.old_id = revision.rev_text_id '
                  'where page.page_title = %s and page.page_namespace = %s '
                  'limit 1', query_attrs)
        row = c.fetchone()
        if row:
            text = row[0]
            return {'text': text}

    def _attachments(self, page_id):
        """Yield path to next file attached to wiki page"""
        c = self.connection().cursor()
        c.execute('select il_to from imagelinks '
                  'where il_from = %s' % page_id)
        for row in c:
            name = row[0]
            # mediawiki stores attachments in subdirectories
            # based on md5-hash of filename,
            # so we need to build the path to the file as follows
            md5 = hashlib.md5(name).hexdigest()
            path = os.path.join(self.options.attachments_dir,
                                md5[:1], md5[:2], name)
            if os.path.isfile(path):
                yield path
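
    # Note (illustrative, not part of the original file): MediaWiki's image
    # store hashes the upload name, so a file named 'Map.png' whose md5 hex
    # digest happened to start with 'ab' would be looked up by the loop above
    # at <attachments_dir>/a/ab/Map.png.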

    def extract(self):
        self.extract_pages()

    def extract_pages(self):
        allura_base.log.info('Extracting pages...')
        for page in self._pages():
            self.extract_history(page)
            self.extract_talk(page)
            self.extract_attachments(page)
        allura_base.log.info('Extracting pages done')

    def extract_history(self, page):
        page_id = page['page_id']
        for page_data in self._history(page_id):
            page_data.update(page)
            self._save(json.dumps(page_data), 'pages', str(page_id),
                       'history', str(page_data['timestamp']) + '.json')
        allura_base.log.info('Extracted history for page %s (%s)'
                             % (page_id, page['title']))

    def extract_talk(self, page):
        page_id = page['page_id']
        talk_page_data = self._talk(page['title'])
        if talk_page_data:
            self._save(json.dumps(talk_page_data), 'pages', str(page_id),
                       'discussion.json')
            allura_base.log.info('Extracted talk for page %s (%s)'
                                 % (page_id, page['title']))
        else:
            allura_base.log.info('No talk for page %s (%s)'
                                 % (page_id, page['title']))

    def extract_attachments(self, page):
        page_id = page['page_id']
        for filepath in self._attachments(page_id):
            self._save_attachment(filepath, 'pages', str(page_id),
                                  'attachments')
        allura_base.log.info('Extracted attachments for page %s (%s)'
                             % (page_id, page['title']))
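

# --- Usage sketch (not from the original file) -------------------------------
# A minimal illustration of how MySQLExtractor might be driven.  The `options`
# object is assumed to be an argparse/optparse-style namespace carrying the
# attributes the extractor reads above (dump_dir, attachments_dir, host, port,
# user, password, db_name); the concrete values below are made-up examples.
#
#     from argparse import Namespace
#
#     options = Namespace(
#         dump_dir='/tmp/mediawiki-dump',              # cleared and re-created
#         attachments_dir='/var/lib/mediawiki/images',
#         host='localhost', port=3306,
#         user='wiki', password='secret', db_name='wikidb',
#     )
#     MySQLExtractor(options).extract()
#
# Per the _save and _save_attachment calls above, a run populates dump_dir
# roughly as follows:
#
#     <dump_dir>/pages/<page_id>/history/<rev_timestamp>.json
#     <dump_dir>/pages/<page_id>/discussion.json
#     <dump_dir>/pages/<page_id>/attachments/<file>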