Work on feed discovery assistant.
[straw/fork.git] / straw / SummaryParser.py
blobd0cdffa0fe8763d5e434dc108e6d1ec540d83e2f
1 """ Summaryparser.py
3 Wrapper module to feedparser and responsible for assigning data to Feed and
4 SummaryItems.
5 """
6 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
22 from straw import helpers
23 import HTMLParser
24 import SummaryItem
25 import copy
26 import error
27 import feedparser
28 import htmlentitydefs
29 import string
30 import sys
31 import time
32 import types
class TitleImgParser(HTMLParser.HTMLParser):
    """HTML parser that accumulates the text content of a summary (for
    use as a fallback title) and collects the URLs of any inline images,
    resolved against the owning feed's location.
    """

    def __init__(self, feed=None):
        HTMLParser.HTMLParser.__init__(self)
        self._chars = []       # accumulated text fragments
        self._image_urls = []  # absolute URLs of <img src=...> seen so far
        self._feed = feed      # feed whose location resolves relative URLs

    def set_feed(self, feed):
        self._feed = feed

    def get_image_urls(self):
        return self._image_urls

    def get_text(self, nchars=None):
        # Join the buffered fragments; optionally truncate to nchars.
        text = ''.join(self._chars).strip()
        if nchars:
            text = text[:nchars]
        return text

    def close(self):
        self.flush()
        HTMLParser.HTMLParser.close(self)

    def flush(self):
        # Discard buffered text. Image URLs are deliberately kept so they
        # remain available after close() -- see the commented-out line.
        del self._chars[:]
        #del self._image_urls[:]

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            for name, value in attrs:
                if name == 'src':
                    # Resolve possibly-relative src against the feed URL.
                    url = helpers.complete_url(value, self._feed.location)
                    self._image_urls.append(url)
                    return

    def handle_data(self, data):
        self._chars.append(data)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # NOTE(review): references arriving before any text are dropped --
        # presumably to keep titles from starting with markup; confirm.
        if not self._chars: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            # Markup-significant characters (quotes, &, ', <, >) are kept
            # escaped so raw markup is not re-introduced into the text.
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)  # hexadecimal character reference
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self._chars.append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self._chars: return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            # Keep markup-significant entities escaped.
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                # Map an entity name to its Unicode code point.
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            # Unknown entities are passed through escaped rather than raising.
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self._chars.append(text)
108 def _remove_ids_if_duplicates(items):
109 ids = {}
110 duplicates = False
111 for i in items:
112 if i.guid is not None and i.guid != "":
113 if ids.has_key(i.guid):
114 duplicates = True
115 break
116 ids[i.guid] = True
117 if duplicates:
118 for i in items:
119 i.guid = None
120 i.guidislink = False
121 return
123 def _to_unicode(text, encoding):
124 if text and not isinstance(text, types.UnicodeType):
125 text = unicode(text, encoding)
126 return text
def feedparser_parse(data):
    """Run feedparser over raw feed data.

    Returns a (parsed_content, encoding) tuple.  The encoding is the one
    feedparser detected, falling back first to the locale encoding and
    finally to the interpreter's default encoding.
    """
    parsed_content = feedparser.parse(data)
    encoding = (parsed_content.get('encoding', helpers.get_locale_encoding())
                or sys.getdefaultencoding())
    return (parsed_content, encoding)
def parse_channel_info(parsed, parsed_content, encoding):
    """Copy the channel-level fields of a feedparser result onto `parsed`.

    Text fields are decoded with `encoding`; the modified date is copied
    through untouched.  Returns `parsed`.
    """
    channel = parsed_content.feed
    # (destination attribute, feedparser key) pairs for the text fields.
    text_fields = (('title', 'title'),
                   ('description', 'description'),
                   ('location', 'link'),
                   ('copyright', 'copyright'),
                   ('creator', 'creator'))
    for attr, key in text_fields:
        setattr(parsed, attr, _to_unicode(channel.get(key, ''), encoding))
    parsed.last_build_date = channel.get('modified')
    return parsed
def parse(content, feed = None):
    """Parse raw feed data into a Feed model object.

    When `feed` is None a fresh Feed is created, otherwise the given one
    is updated in place.  Channel info is filled in, one item is added per
    entry, and guids are invalidated if the feed reuses them.  Returns the
    feed.
    """
    parsed_content, encoding = feedparser_parse(content)
    if feed == None:
        from model import Feed
        feed = Feed()
    feed = parse_channel_info(feed, parsed_content, encoding)
    for entry in parsed_content.entries:
        feed.add_item(_parse_entry(entry, feed))
    _remove_ids_if_duplicates(feed.items)
    return feed
160 def sanitize_content(data, feed, limit=60):
161 images = None
162 title = ""
163 try:
164 tp = TitleImgParser(feed)
165 try:
166 tp.feed(data)
167 #images = [image for image in tp.get_image_urls()]
168 #print tp.get_image_urls()
169 images = tp.get_image_urls()
170 title = tp.get_text(limit)
171 except Exception, ex:
172 error.log(ex)
173 finally:
174 tp.close()
175 #print images
176 return (title, images)
def _parse_entry(entry, feed):
    """Build an Item model object from a single feedparser entry.

    Aggregates the entry's content blocks into one description, extracts
    image URLs from it, and copies the assorted optional metadata fields
    (guid, author, PRISM, licensing, enclosures, source) across.
    """
    from model import Item
    item = Item()#SummaryItem.SummaryItem()
    item.feed = feed
    item.images = []
    content = []
    description = ""
    title = ""#_("No title")

    if entry.has_key('content'):
        # it can have multiple content, so we just aggregate them for now.
        for c in entry.content:
            try:
                if c.value not in content:
                    content.append(c.value)
            except TypeError, te:
                error.log(te)
                pass

    # Fall back to the summary element when no content blocks were found.
    if not len(content) and entry.has_key('summary'):
        content.append(entry.get('summary', ''))

    description = "<br/>".join(content)

    title = entry.get('title', '')
    if description:
        # Collect image URLs; the text-derived alternate title is unused
        # while the fallback-title code below stays commented out.
        alttitle, item.images = sanitize_content(description, feed)
        #import ImageCache
        #[ImageCache.cache.add_refer(image, False, item) for image in images]
    if not title:
        pass
        # get the first MAXSPLIT words of the description and make that as our
        # title
        #dwords = string.splitfields(alttitle, maxsplit=6)
        #title = ' '.join(dwords[:]) + ' ...'
    title = title.replace('\n', '')
    item.title = title

    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    item.pub_date = entry.get('modified_parsed', time.strftime("%Y-%m-%d", time.localtime()))#time.localtime())
    item.license_urls.append(entry.get('license', None))
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section', None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)

    #print item.pub_date
    #print time.strftime("%Y-%m-%d",time.localtime())
    # NOTE(review): this unconditionally overwrites the entry's own
    # modified date (assigned above) with the current local time, making
    # the earlier pub_date assignment dead -- confirm whether intentional.
    item.pub_date = time.strftime("%Y-%m-%d %H:%M", time.localtime())

    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url,
                           'text': text}
        else:
            # There's no point displaying the source if there's no url in the
            # first place. This is a violation of the RSS 0.92 spec
            # http://backend.userland.com/rss092.
            item.source = None

    return item