# Source code for weblib.feed

# Copyright: 2012, Grigoriy Petukhov
# Author: Grigoriy Petukhov (http://lorien.name)
# License: BSD
import logging
from hashlib import sha1
from time import mktime
from datetime import datetime
import feedparser
from lxml.html.clean import clean_html

from weblib.etree import truncate_html
from weblib.html import strip_tags
from weblib.text import remove_bom

# Module-level logger; parse_feed() reports per-entry parsing failures
# through it instead of aborting the whole feed.
log = logging.getLogger('weblib.feed')


def parse_entry_date(entry):
    """Return the date of a feed entry as a naive ``datetime``.

    The ``published_parsed``, ``created_parsed``, ``updated_parsed`` and
    ``modified_parsed`` attributes of ``entry`` are checked in that order
    and the first non-empty one is used.

    :param entry: entry object exposing ``*_parsed`` time tuples and ``link``
    :returns: ``datetime`` carrying the same calendar fields as the tuple
    :raises Exception: if none of the date attributes is set
    """
    date_fields = ('published', 'created', 'updated', 'modified')

    for key in date_fields:
        value = getattr(entry, '%s_parsed' % key, None)
        if value:
            # feedparser documents *_parsed as a UTC time tuple.  Passing it
            # through mktime()/fromtimestamp() would treat it as local time
            # and shift the result by an hour whenever local DST is active,
            # so the datetime is built from the tuple fields directly.
            year, month, day, hour, minute, second = value[:6]
            # Time tuples allow leap seconds (60, 61); datetime does not.
            return datetime(year, month, day, hour, minute, min(second, 59))

    raise Exception('Could not parse date of entry %s' % entry.link)


def parse_entry_tags(entry):
    """Return the tags of the entry as a list of unique lower-cased strings.

    For every tag the ``label`` is used, falling back to ``term``.  The
    value is split on commas, so a single feed tag such as ``"Python, Web"``
    produces two items.  Items are stripped and lower-cased; blank items
    and duplicates are dropped.

    The result keeps the order in which tags are first seen, so the output
    is the same on every run (a plain ``list(set(...))`` is not).

    :param entry: mapping-like entry; ``entry.get('tags')`` may be missing
        or ``None``
    :returns: list of tag strings
    """
    tags = []
    seen = set()
    for tag in entry.get('tags') or []:
        term = tag.get('label') or tag.get('term') or ''
        for item in term.split(','):
            item = item.strip().lower()
            if item and item not in seen:
                seen.add(item)
                tags.append(item)
    return tags


def parse_entry_content(entry):
    """Return the most complete body text available for the entry.

    The ``content`` parts are inspected first: the ``text/html`` part is
    preferred, then ``application/xhtml+xml``, otherwise the first part
    found.  After that ``summary`` and ``description`` each replace the
    body if they are strictly longer than what has been chosen so far.

    An empty string is returned when the entry carries no text at all; a
    missing, empty or ``None`` field is simply skipped.
    """
    body = ''

    # ``content`` may be absent or an empty list; both mean "no parts".
    parts = getattr(entry, 'content', None) or ()
    mapping = dict((x.type, x.value) for x in parts)
    if 'text/html' in mapping:
        body = mapping['text/html']
    elif 'application/xhtml+xml' in mapping:
        body = mapping['application/xhtml+xml']
    elif mapping:
        body = list(mapping.values())[0]
    # Guard against a part whose value is None so len() below is safe.
    body = body or ''

    for field in ('summary', 'description'):
        candidate = getattr(entry, field, None)
        if candidate and len(candidate) > len(body):
            body = candidate
    return body


def parse_entry_teaser(entry, size):
    """Return the raw entry content passed through ``truncate_html``
    with the given ``size``."""
    content = truncate_html(parse_entry_content(entry), size)
    return content


def build_entry_content(entry, teaser=False, teaser_size=None):
    """Return the entry content passed through ``clean_html``.

    When ``teaser`` is true the cleaned content is additionally passed
    through ``truncate_html`` with ``teaser_size``.
    """
    content = clean_html(parse_entry_content(entry))
    if teaser:
        content = truncate_html(content, teaser_size)
    return content


def parse_feed(grab, teaser_size=1000):
    """
    Extract details of feed fetched with Grab.

    The response body is parsed with feedparser and every entry is
    converted with ``parse_entry``.  An entry that fails to convert is
    logged and left out; it does not abort the rest of the feed.

    Returns dict with keys:

    * feed
    * entries
    """

    # BOM removing is required because without it
    # sometimes feedparser just raise SegmentationFault o_O
    raw_body = remove_bom(grab.response.body)
    feed = feedparser.parse(raw_body)

    entries = []
    for entry in feed.entries:
        try:
            parsed = parse_entry(entry, feed, teaser_size=teaser_size)
        except Exception as ex:
            log.error('Entry parsing error', exc_info=ex)
        else:
            entries.append(parsed)

    return {'feed': feed, 'entries': entries}


def parse_entry(entry, feed, teaser_size):
    """Convert a single feed entry into a plain dict.

    :param entry: feed entry providing ``link``, ``title`` and ``get()``
    :param feed: the parsed feed the entry belongs to; not used here, kept
        so the call signature stays the same
    :param teaser_size: size passed to ``truncate_html`` for the teaser
    :returns: dict with keys ``url``, ``title``, ``content``, ``teaser``,
        ``date``, ``tags`` and ``guid``
    :raises Exception: propagated from ``parse_entry_date`` when the entry
        has no usable date, so no separate date check is needed here
    """
    # Clean the content once and derive the teaser from the cleaned text.
    # This gives the same result as
    # build_entry_content(entry, teaser=True, teaser_size=teaser_size)
    # without running the HTML cleaner a second time.
    content = build_entry_content(entry)
    details = {
        'url': entry.link,
        'title': strip_tags(entry.title),
        'content': content,
        'teaser': truncate_html(content, teaser_size),
        'date': parse_entry_date(entry),
        'tags': parse_entry_tags(entry),
    }

    # The guid is the SHA-1 hex digest of the entry id, or of the link when
    # the entry has no id.
    guid_token = (entry.get('id') or entry.link).encode('utf-8')
    details['guid'] = sha1(guid_token).hexdigest()
    return details