Fixed new feed items date handling, added date column to the item list, removed Summa...
[straw.git] / straw / SummaryParser.py
1 """ Summaryparser.py
3 Wrapper module to feedparser and responsible for assigning data to Feed and
4 Items.
5 """
6 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """

from model import Item
from straw import helpers
import HTMLParser
import error
import feedparser
import htmlentitydefs
import sys
import time
import types

class TitleImgParser(HTMLParser.HTMLParser):
    def __init__(self, feed=None):
        HTMLParser.HTMLParser.__init__(self)
        self._chars = []
        self._image_urls = []
        self._feed = feed

    def set_feed(self, feed):
        self._feed = feed

    def get_image_urls(self):
        return self._image_urls

    def get_text(self, nchars=None):
        text = ''.join(self._chars).strip()
        if nchars:
            text = text[:nchars]
        return text

    def close(self):
        self.flush()
        HTMLParser.HTMLParser.close(self)

    def flush(self):
        del self._chars[:]
        #del self._image_urls[:]

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            for name, value in attrs:
                if name == 'src':
                    url = helpers.complete_url(value, self._feed.link)
                    self._image_urls.append(url)
                    return

    def handle_data(self, data):
        self._chars.append(data)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self._chars: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self._chars.append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self._chars: return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try:
                text = unichr(name2cp(ref)).encode('utf-8')
            except KeyError:
                text = '&%s;' % ref
        self._chars.append(text)
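
# A minimal usage sketch of TitleImgParser, kept as a comment so the module
# itself is unchanged. The markup is made up, and it assumes
# helpers.complete_url() resolves a relative src against feed.link
# (here taken to be 'http://example.com/'):
#
#   tp = TitleImgParser(feed)
#   tp.feed('<p>Hello <img src="/logo.png"> world</p>')
#   tp.get_text(20)      # -> 'Hello  world'
#   tp.get_image_urls()  # -> ['http://example.com/logo.png']
#   tp.close()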

def _remove_ids_if_duplicates(items):
    """If two items in a feed share a guid, the guids are unreliable;
    drop them (and guidislink) for every item."""
    ids = {}
    duplicates = False
    for i in items:
        if i.guid is not None and i.guid != "":
            if ids.has_key(i.guid):
                duplicates = True
                break
            ids[i.guid] = True
    if duplicates:
        for i in items:
            i.guid = None
            i.guidislink = False
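
# Illustration (hypothetical items): if two items both carry guid 'x',
# _remove_ids_if_duplicates() nulls the guid of *every* item in the list
# and clears guidislink; with unique guids the list is left untouched.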

def _to_unicode(text, encoding):
    if text and not isinstance(text, types.UnicodeType):
        text = unicode(text, encoding)
    return text

def feedparser_parse(data):
    pc = feedparser.parse(data)
    enc = pc.get('encoding', helpers.get_locale_encoding())
    if not enc:
        enc = sys.getdefaultencoding()
    return (pc, enc)
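
# Encoding fallback sketch: feedparser's detected encoding wins, then the
# locale encoding from helpers.get_locale_encoding(), then Python's
# default encoding. For example (xml_bytes is hypothetical):
#
#   pc, enc = feedparser_parse(xml_bytes)
#   pc.feed.get('title'), enc   # -> (u'Example Feed', 'utf-8')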

def parse_channel_info(parsed, parsed_content, encoding):
    parsed.title = _to_unicode(parsed_content.feed.get('title', ''), encoding)
    parsed.description = _to_unicode(parsed_content.feed.get('description', ''), encoding)
    parsed.link = _to_unicode(parsed_content.feed.get('link', ''), encoding)
    parsed.copyright = _to_unicode(parsed_content.feed.get('copyright', ''), encoding)
    parsed.last_build_date = parsed_content.feed.get('modified')
    parsed.creator = _to_unicode(parsed_content.feed.get('creator', ''), encoding)
    return parsed

def parse(content, feed=None, location=None):
    parsed_content, encoding = feedparser_parse(content)

    if not feed:
        from model import Feed
        feed = Feed()
        feed.location = location
    feed = parse_channel_info(feed, parsed_content, encoding)

    for entry in parsed_content.entries:
        item = _parse_entry(entry, feed)
        feed.add_item(item)

    _remove_ids_if_duplicates(feed.items)

    return feed
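
# A hedged usage sketch; the URL and the urllib fetch are hypothetical,
# since this module only parses content it is handed:
#
#   import urllib
#   data = urllib.urlopen('http://example.com/feed.xml').read()
#   feed = parse(data, location='http://example.com/feed.xml')
#   for item in feed.items:
#       print item.title, item.pub_date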

def sanitize_content(data, feed, limit=60):
    images = None
    title = ""
    tp = TitleImgParser(feed)
    try:
        try:
            tp.feed(data)
            images = tp.get_image_urls()
            title = tp.get_text(limit)
        except Exception, ex:
            error.log(ex)
    finally:
        tp.close()
    return (title, images)
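
# For example (markup invented for illustration), assuming feed.link is
# 'http://example.com/':
#
#   sanitize_content('<b>Breaking</b> news <img src="a.png">', feed)
#   # -> ('Breaking news', ['http://example.com/a.png'])
#
# with the returned title clipped to at most `limit` characters.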

def _parse_entry(entry, feed):
    item = Item()
    item.feed = feed
    item.images = []
    content = []
    description = ""
    title = ""  # _("No title")

    if entry.has_key('content'):
        # an entry can have several content elements; aggregate them for now.
        for c in entry.content:
            try:
                if c.value not in content:
                    content.append(c.value)
            except TypeError, te:
                error.log(te)

    if not len(content) and entry.has_key('summary'):
        content.append(entry.get('summary', ''))

    description = "<br/>".join(content)

    title = entry.get('title', '')
    if description:
        alttitle, item.images = sanitize_content(description, feed)
        #import ImageCache
        #[ImageCache.cache.add_refer(image, False, item) for image in images]
    if not title:
        pass
        # get the first MAXSPLIT words of the description and use them as the
        # title
        #dwords = string.splitfields(alttitle, maxsplit=6)
        #title = ' '.join(dwords[:]) + ' ...'
    title = title.replace('\n', '')
    item.title = title

    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    item.license_urls.append(entry.get('license', None))
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section', None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)

    # feedparser can return 'updated_parsed' as None for unparseable dates,
    # so fall back to the current local time in that case too.
    date_tuple = entry.get('updated_parsed') or time.localtime()
    item.pub_date = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple)

    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url, 'text': text}
        else:
            # There's no point in displaying the source if there's no url in
            # the first place, even though that is a violation of the RSS
            # 0.92 spec, http://backend.userland.com/rss092.
            item.source = None

    return item
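
# Rough shape of the Item built above (values hypothetical):
#
#   item.title        # u'Example entry', newlines stripped
#   item.description  # content elements joined with '<br/>', or the summary
#   item.pub_date     # e.g. '2005-06-01 13:45:00'; local time when the
#                     # entry carries no parseable date
#   item.source       # {'url': ..., 'text': ...} or None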