opensourceprojects.eu / osp-allura / [c87c03] /ForgeBlog/forgeblog/command/rssfeeds.py

[c87c03]: ForgeBlog / forgeblog / command / rssfeeds.py History

rssfeeds.py 205 lines (164 with data), 7.5 kB

from time import mktime
from datetime import datetime
from HTMLParser import HTMLParser

import feedparser
import html2text
from bson import ObjectId

import base
from allura.command import base as allura_base

from ming.orm import session
from pylons import c

from allura import model as M
from forgeblog import model as BM
from forgeblog import version
from forgeblog.main import ForgeBlogApp
from allura.lib import exceptions

# Module-wide html2text setting: per html2text's documented configuration,
# BODY_WIDTH is the column at which output is wrapped and 0 disables wrapping,
# so converted feed content is not hard-wrapped.
html2text.BODY_WIDTH = 0

class MDHTMLParser(HTMLParser):
    """Re-serialise an HTML fragment, wrapping text in ``[plain]`` markers.

    Tags, comments and declarations are written back to ``result_doc``
    essentially unchanged, while every run of character data (including
    entity and character references) is enclosed in
    ``[plain]`` ... ``[/plain]``.

    Usage: ``feed()`` the HTML, call ``close()``, then read ``result_doc``.
    ``close()`` must be called first because it terminates a marker that
    may still be open after the last chunk of data.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Elements that never take a closing tag; they are emitted
        # self-closed ("<br/>") and their end tags are ignored.
        self.NO_END_TAGS = ["area", "base", "basefont", "br", "col", "frame",
                            "hr", "img", "input", "link", "meta", "param"]
        self.CUSTTAG_OPEN = u"[plain]"
        self.CUSTTAG_CLOSE = u"[/plain]"
        # Output accumulated so far.
        self.result_doc = u""
        # True while a [plain] marker has been written but not yet closed.
        self.custom_tag_opened = False

    def _open_custom_tag(self):
        """Write the opening marker unless one is already open."""
        if not self.custom_tag_opened:
            self.result_doc = u"%s%s" % (self.result_doc, self.CUSTTAG_OPEN)
            self.custom_tag_opened = True

    def _close_custom_tag(self):
        """Write the closing marker if one is currently open."""
        if self.custom_tag_opened:
            self.result_doc = u"%s%s" % (self.result_doc, self.CUSTTAG_CLOSE)
            self.custom_tag_opened = False

    @staticmethod
    def _format_attr(name, value):
        """Return ``' name="value"'`` using a quote style safe for *value*.

        ``value`` is ``None`` for attributes written without a value
        (``<input disabled>``); those are emitted as the bare name.
        """
        if value is None:
            return u" %s" % name
        if '"' not in value:
            return u' %s="%s"' % (name, value)
        if "'" not in value:
            # Value contains double quotes only: single-quote it instead.
            return u" %s='%s'" % (name, value)
        # Both quote kinds present: escape the double quotes.
        return u' %s="%s"' % (name, value.replace('"', u"&quot;"))

    def handle_starttag(self, tag, attrs):
        """Emit the start tag, closing any open text marker first."""
        self._close_custom_tag()

        pieces = [u"<%s" % tag]
        for attr_name, attr_value in attrs:
            pieces.append(self._format_attr(attr_name, attr_value))
        pieces.append(u"/>" if tag in self.NO_END_TAGS else u">")
        self.result_doc = u"%s%s" % (self.result_doc, u"".join(pieces))

    def handle_endtag(self, tag):
        """Emit the end tag; end tags of NO_END_TAGS elements are dropped."""
        if tag in self.NO_END_TAGS:
            return
        self._close_custom_tag()
        self.result_doc = u"%s</%s>" % (self.result_doc, tag)

    def handle_data(self, data):
        """Wrap each non-blank line of *data* in its own marker pair.

        Whitespace-only lines are copied through unwrapped.  A final line
        without an end-of-line character leaves the marker open, because the
        data block may be incomplete; the next handler call (or ``close()``)
        terminates it.
        """
        res_data = ''

        for line in data.splitlines(True):
            if not line or line.isspace():
                # Don't wrap lines consisting only of whitespace.
                res_data += line
                continue

            # The marker may already be open from a previous incomplete
            # data block or from an entity/character reference.
            if not self.custom_tag_opened:
                res_data += self.CUSTTAG_OPEN
                self.custom_tag_opened = True

            # The EOL is stripped so the closing marker precedes it.
            res_data += line.rstrip('\r\n')

            if line.endswith(('\r', '\n')):
                res_data += self.CUSTTAG_CLOSE + '\n'
                self.custom_tag_opened = False

        self.result_doc += res_data

    def handle_comment(self, data):
        """Emit the comment, closing any open text marker first."""
        self._close_custom_tag()
        self.result_doc = u"%s<!-- %s -->" % (self.result_doc, data)

    def handle_entityref(self, name):
        """Emit a named reference (``&name;``) inside a text marker."""
        self._open_custom_tag()
        self.result_doc = u"%s&%s;" % (self.result_doc, name)

    def handle_charref(self, name):
        """Emit a numeric reference (``&#NNN;`` / ``&#xNNN;``) inside a marker.

        HTMLParser passes *name* without the leading ``#`` (e.g. ``'62'`` or
        ``'x3E'``), so the ``#`` has to be written back explicitly.
        """
        self._open_custom_tag()
        self.result_doc = u"%s&#%s;" % (self.result_doc, name)

    def handle_decl(self, data):
        """Emit the declaration (``<!...>``), closing any open marker first."""
        self._close_custom_tag()
        self.result_doc = u"%s<!%s>" % (self.result_doc, data)

    def close(self):
        """Finish parsing and terminate a marker left open by trailing text."""
        HTMLParser.close(self)
        self._close_custom_tag()


class RssFeedsCommand(base.BlogCommand):
    """Command that imports entries of external feeds as blog posts.

    For every blog tool whose ``Globals`` document lists ``external_feeds``
    (or only the tool given by ``--appid``), each feed is fetched and every
    entry that has no post with the same base slug yet is stored as a
    published ``BlogPost``.
    """
    summary = 'Rss feed client'
    parser = base.BlogCommand.standard_parser(verbose=True)
    parser.add_option('-a', '--appid', dest='appid', default='',
                      help='application id')
    parser.add_option('-u', '--username', dest='username', default='root',
                      help='poster username')

    def command(self):
        """Entry point: set up, collect the feeds and process each of them."""
        self.basic_setup()

        # Posts are created on behalf of the user given by --username.
        user = M.User.query.get(username=self.options.username)
        c.user = user

        self.prepare_feeds()
        for appid in self.feed_dict:
            for feed_url in self.feed_dict[appid]:
                self.process_feed(appid, feed_url)

    def prepare_feeds(self):
        """Populate ``self.feed_dict`` as ``{app_config_id: [feed_url, ...]}``.

        Raises ``exceptions.NoSuchGlobalsError`` when ``--appid`` was given
        but no matching ``Globals`` document exists.
        """
        if self.options.appid != '':
            gl_app = BM.Globals.query.get(app_config_id=ObjectId(self.options.appid))
            if not gl_app:
                raise exceptions.NoSuchGlobalsError("The globals %s " \
                     "could not be found in the database" % self.options.appid)
            candidates = [gl_app]
        else:
            candidates = BM.Globals.query.find().all()

        feed_dict = {}
        for gl_app in candidates:
            # Truthiness also covers a missing (None) feed list.
            if gl_app.external_feeds:
                feed_dict[gl_app.app_config_id] = gl_app.external_feeds
        self.feed_dict = feed_dict

    def _entry_content(self, e):
        """Return the post text for feed entry *e*, without the link suffix.

        HTML content parts are passed through MDHTMLParser and html2text;
        every other part is wrapped in ``[plain]`` markers.  Entries without
        a ``content`` element fall back to summary, subtitle, then title.
        """
        if 'content' not in e:
            fallback = getattr(e, 'summary',
                               getattr(e, 'subtitle',
                                       getattr(e, 'title')))
            return '[plain]%s[/plain]' % fallback

        content = u''
        for ct in e.content:
            if ct.type != 'text/html':
                content += '[plain]%s[/plain]' % ct.value
                continue
            md_parser = MDHTMLParser()
            md_parser.feed(ct.value)
            md_parser.close()  # must be called before reading result_doc
            content += html2text.html2text(md_parser.result_doc,
                                           baseurl=e.link)
        return content

    def process_feed(self, appid, feed_url):
        """Fetch *feed_url* and store its new entries as posts of *appid*.

        Returns without doing anything when the app config does not exist or
        when feedparser flags the feed as ``bozo``.
        """
        appconf = M.AppConfig.query.get(_id=appid)
        if not appconf:
            return

        c.project = appconf.project
        app = ForgeBlogApp(c.project, appconf)
        c.app = app

        allura_base.log.info("Get feed: %s", feed_url)
        f = feedparser.parse(feed_url)
        if f.bozo:
            # No exception is being handled here, so log.exception() would
            # only append an empty traceback; log a plain error instead.
            allura_base.log.error("%s: %s", feed_url, f.bozo_exception)
            return

        for e in f.entries:
            title = e.title

            updated_parsed = getattr(e, 'updated_parsed', None)
            if not updated_parsed:
                # Without a date neither timestamp nor slug can be built;
                # skip this entry instead of aborting the whole run.
                allura_base.log.warning(
                    "%s: entry '%s' has no parsed date, skipped",
                    feed_url, title)
                continue
            # feedparser reports parsed dates as UTC time tuples, so build
            # the naive UTC datetime directly; time.mktime() would treat the
            # tuple as local time and shift it by the server's UTC offset.
            updated = datetime(*updated_parsed[:6])

            content = self._entry_content(e)
            content += u' [link](%s)' % e.link

            base_slug = BM.BlogPost.make_base_slug(title, updated)
            b_count = BM.BlogPost.query.find(
                dict(slug=base_slug, app_config_id=appid)).count()
            if b_count == 0:
                post = BM.BlogPost(title=title, text=content, timestamp=updated,
                                   app_config_id=appid,
                                   tool_version={'blog': version.__version__},
                                   state='published')
                post.neighborhood_id = c.project.neighborhood_id
                post.make_slug()
                post.commit()

        session(BM.BlogPost).flush()