Parent: [c4f76a] (diff)

Child: [7a8f9a] (diff)

Download this file

rssfeeds.py    205 lines (164 with data), 7.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
from time import mktime
from datetime import datetime
from HTMLParser import HTMLParser
import feedparser
import html2text
from bson import ObjectId
import base
from allura.command import base as allura_base
from ming.orm import session
from pylons import c
from allura import model as M
from forgeblog import model as BM
from forgeblog import version
from forgeblog.main import ForgeBlogApp
from allura.lib import exceptions
# Module-level override of html2text's BODY_WIDTH setting, applied once at import.
# NOTE(review): presumably 0 disables hard line-wrapping of the generated
# Markdown -- confirm against the html2text documentation.
html2text.BODY_WIDTH = 0
class MDHTMLParser(HTMLParser):
    """Re-serialise an HTML document, wrapping text in ``[plain]`` markers.

    Tags, comments and declarations are copied to ``result_doc`` as markup,
    while character data and character/entity references are enclosed in
    ``[plain]...[/plain]``.  Call ``feed()`` with the HTML, then ``close()``,
    and only then read ``result_doc`` (``close()`` terminates a marker that
    may still be open).

    NOTE(review): the ``[plain]`` marker is presumably consumed by the
    Markdown renderer downstream to suppress Markdown interpretation of feed
    text -- confirm against the renderer.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Void elements: written as "<tag .../>" and never given an end tag.
        self.NO_END_TAGS = ["area", "base", "basefont", "br", "col", "frame",
                            "hr", "img", "input", "link", "meta", "param"]
        self.CUSTTAG_OPEN = u"[plain]"
        self.CUSTTAG_CLOSE = u"[/plain]"
        # Output accumulated so far.
        self.result_doc = u""
        # True while a [plain] marker has been written but not yet closed.
        self.custom_tag_opened = False

    def _open_custom_tag(self):
        """Write the opening marker unless one is already open."""
        if not self.custom_tag_opened:
            self.result_doc = u"%s%s" % (self.result_doc, self.CUSTTAG_OPEN)
            self.custom_tag_opened = True

    def _close_custom_tag(self):
        """Write the closing marker if one is currently open."""
        if self.custom_tag_opened:
            self.result_doc = u"%s%s" % (self.result_doc, self.CUSTTAG_CLOSE)
            self.custom_tag_opened = False

    def handle_starttag(self, tag, attrs):
        """Emit ``tag`` with its attributes, closing any open marker first.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``; ``value`` is ``None`` for attributes without a value.
        """
        self._close_custom_tag()
        pieces = [u"<%s" % tag]
        for name, value in attrs:
            if value is None:
                # Valueless attribute such as <input disabled>.
                pieces.append(u" %s" % name)
            elif '"' in value and "'" not in value:
                # Single quotes are safe and keep the double quotes readable.
                pieces.append(u" %s='%s'" % (name, value))
            else:
                # Default to double quotes; escape embedded double quotes so
                # a value holding both quote kinds cannot end the attribute.
                pieces.append(u' %s="%s"' % (name,
                                             value.replace('"', '&quot;')))
        pieces.append(u"/>" if tag in self.NO_END_TAGS else u">")
        self.result_doc = u"%s%s" % (self.result_doc, u"".join(pieces))

    def handle_endtag(self, tag):
        """Emit ``</tag>`` unless ``tag`` is a void element."""
        if tag in self.NO_END_TAGS:
            return
        self._close_custom_tag()
        self.result_doc = u"%s</%s>" % (self.result_doc, tag)

    def handle_data(self, data):
        """Wrap every non-blank line of ``data`` in its own marker pair."""
        res_data = ''
        for line in data.splitlines(True):
            # Whitespace-only lines are copied through without wrapping.
            if not line or line.isspace():
                res_data += line
                continue
            # The marker may already be open from a previous, incomplete
            # data block; only open it when it is not.
            if not self.custom_tag_opened:
                res_data += self.CUSTTAG_OPEN
                self.custom_tag_opened = True
            # Strip the EOL so the closing marker goes before it.
            res_data += line.rstrip('\r\n')
            if line.endswith(('\r', '\n')):
                res_data += self.CUSTTAG_CLOSE + '\n'
                self.custom_tag_opened = False
            # else: no EOL may mean an incomplete data block; leave the
            # marker open for the next handler (or close()) to terminate.
        self.result_doc += res_data

    def handle_comment(self, data):
        """Emit an HTML comment, closing any open marker first."""
        self._close_custom_tag()
        self.result_doc = u"%s<!-- %s -->" % (self.result_doc, data)

    def handle_entityref(self, name):
        """Emit a named reference (``&name;``) inside a marker."""
        self._open_custom_tag()
        self.result_doc = u"%s&%s;" % (self.result_doc, name)

    def handle_charref(self, name):
        """Emit a numeric reference (``&#NNN;`` / ``&#xHH;``) inside a marker.

        ``name`` arrives without the leading ``#``, so it is restored here.
        """
        self._open_custom_tag()
        self.result_doc = u"%s&#%s;" % (self.result_doc, name)

    def handle_decl(self, data):
        """Emit a declaration (``<!...>``), closing any open marker first."""
        self._close_custom_tag()
        self.result_doc = u"%s<!%s>" % (self.result_doc, data)

    def close(self):
        """Finish parsing and terminate a marker that is still open."""
        HTMLParser.close(self)
        self._close_custom_tag()
class RssFeedsCommand(base.BlogCommand):
    """Command that imports entries of external feeds as blog posts.

    For every blog ``Globals`` record that lists ``external_feeds`` (or only
    the one selected with ``--appid``), each feed is fetched with feedparser
    and every entry whose slug is not yet present is stored as a published
    ``BlogPost``.
    """

    summary = 'Rss feed client'
    parser = base.BlogCommand.standard_parser(verbose=True)
    parser.add_option('-a', '--appid', dest='appid', default='',
                      help='application id')
    parser.add_option('-u', '--username', dest='username', default='root',
                      help='poster username')

    def command(self):
        """Entry point: set the acting user and process every known feed."""
        self.basic_setup()
        c.user = M.User.query.get(username=self.options.username)
        self.prepare_feeds()
        for appid, feed_urls in self.feed_dict.items():
            for feed_url in feed_urls:
                self.process_feed(appid, feed_url)

    def prepare_feeds(self):
        """Populate ``self.feed_dict`` as ``{app_config_id: [feed_url, ...]}``.

        With ``--appid`` only that application's feeds are collected;
        otherwise every ``Globals`` record is scanned.

        Raises:
            exceptions.NoSuchGlobalsError: ``--appid`` was given but no
                matching ``Globals`` record exists.
        """
        feed_dict = {}
        if self.options.appid != '':
            gl_app = BM.Globals.query.get(
                app_config_id=ObjectId(self.options.appid))
            if not gl_app:
                raise exceptions.NoSuchGlobalsError(
                    "The globals %s "
                    "could not be found in the database" % self.options.appid)
            candidates = [gl_app]
        else:
            candidates = BM.Globals.query.find().all()
        for gl_app in candidates:
            if len(gl_app.external_feeds) > 0:
                feed_dict[gl_app.app_config_id] = gl_app.external_feeds
        self.feed_dict = feed_dict

    def _entry_content(self, e):
        """Return the post text for feed entry ``e``, ending with its link."""
        if 'content' in e:
            content = u''
            for ct in e.content:
                if ct.type != 'text/html':
                    content += '[plain]%s[/plain]' % ct.value
                else:
                    md_parser = MDHTMLParser()
                    md_parser.feed(ct.value)
                    # close() must run before result_doc is read.
                    md_parser.close()
                    content += html2text.html2text(md_parser.result_doc,
                                                   baseurl=e.link)
        else:
            # No content element: fall back to summary, subtitle, then title.
            content = '[plain]%s[/plain]' % getattr(e, 'summary',
                                                    getattr(e, 'subtitle',
                                                            getattr(e, 'title')))
        return content + u' [link](%s)' % e.link

    def process_feed(self, appid, feed_url):
        """Fetch ``feed_url`` and create posts for its entries in app ``appid``.

        Returns silently when the app config does not exist or when
        feedparser flags the feed as malformed (``bozo``).  Entries whose
        base slug already exists for this app are not imported again.
        """
        appconf = M.AppConfig.query.get(_id=appid)
        if not appconf:
            return
        c.project = appconf.project
        c.app = ForgeBlogApp(c.project, appconf)

        allura_base.log.info("Get feed: %s", feed_url)
        f = feedparser.parse(feed_url)
        if f.bozo:
            # Not inside an except block, so log.exception() would attach a
            # meaningless empty traceback; report the stored error instead.
            allura_base.log.error("%s: %s", feed_url, f.bozo_exception)
            return

        for e in f.entries:
            title = e.title
            updated_parsed = getattr(e, 'updated_parsed', None)
            if updated_parsed is None:
                # Without a date no stable slug can be built; skip this entry
                # rather than abort the whole run.
                allura_base.log.warning(
                    "%s: skipping entry without a parsed date: %s",
                    feed_url, title)
                continue
            # feedparser's *_parsed tuples are in UTC; build the datetime
            # from the fields directly (mktime() would treat them as local
            # time and shift the result by the server's UTC offset).
            updated = datetime(*updated_parsed[:6])
            content = self._entry_content(e)

            base_slug = BM.BlogPost.make_base_slug(title, updated)
            existing = BM.BlogPost.query.find(
                dict(slug=base_slug, app_config_id=appid)).count()
            if existing == 0:
                post = BM.BlogPost(title=title, text=content,
                                   timestamp=updated,
                                   app_config_id=appid,
                                   tool_version={'blog': version.__version__},
                                   state='published')
                post.neighborhood_id = c.project.neighborhood_id
                post.make_slug()
                post.commit()
        session(BM.BlogPost).flush()