straw/SummaryParser.py

   1 """ Summaryparser.py
   2
   3 Wrapper module to feedparser and responsible for assigning data to Feed and
   4 SummaryItems.
   5 """
   6 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
   7 __license__ = """
   8 Straw is free software; you can redistribute it and/or modify it under the
   9 terms of the GNU General Public License as published by the Free Software
  10 Foundation; either version 2 of the License, or (at your option) any later
  11 version.
  12
  13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License along with
  18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  19 Place - Suite 330, Boston, MA 02111-1307, USA. """
  20
  21
  22 from straw import helpers
  23 import HTMLParser
  24 import SummaryItem
  25 import copy
  26 import error
  27 import feedparser
  28 import htmlentitydefs
  29 import string
  30 import sys
  31 import time
  32 import types
  33
  34 class TitleImgParser(HTMLParser.HTMLParser):
  35     def __init__(self, feed=None):
  36         HTMLParser.HTMLParser.__init__(self)
  37         self._chars = []
  38         self._image_urls = []
  39         self._feed = feed
  40
  41     def set_feed(self, feed):
  42         self._feed = feed
  43
  44     def get_image_urls(self):
  45         return self._image_urls
  46
  47     def get_text(self, nchars=None):
  48         text = ''.join(self._chars).strip()
  49         if nchars:
  50             text = text[:nchars]
  51         return text
  52
  53     def close(self):
  54         self.flush()
  55         HTMLParser.HTMLParser.close(self)
  56
  57     def flush(self):
  58         del self._chars[:]
  59         #del self._image_urls[:]
  60
  61     def handle_starttag(self, tag, attrs):
  62         if tag == 'img':
  63             for name, value in attrs:
  64                 if name == 'src':
  65                     url = helpers.complete_url(value, self._feed.link)
  66                     self._image_urls.append(url)
  67         return
  68
  69     def handle_data(self, data):
  70         self._chars.append(data)
  71
  72     def handle_charref(self, ref):
  73         # called for each character reference, e.g. for '&#160;', ref will be '160'
  74         if not self._chars: return
  75         ref = ref.lower()
  76         if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
  77             text = '&#%s;' % ref
  78         else:
  79             if ref[0] == 'x':
  80                 c = int(ref[1:], 16)
  81             else:
  82                 c = int(ref)
  83             text = unichr(c).encode('utf-8')
  84         self._chars.append(text)
  85
  86     def handle_entityref(self, ref):
  87         # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
  88         if not self._chars: return
  89         if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
  90             text = '&%s;' % ref
  91         else:
  92             # entity resolution graciously donated by Aaron Swartz
  93             def name2cp(k):
  94                 import htmlentitydefs
  95                 if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
  96                     return htmlentitydefs.name2codepoint[k]
  97                 k = htmlentitydefs.entitydefs[k]
  98                 if k.startswith('&#') and k.endswith(';'):
  99                     return int(k[2:-1]) # not in latin-1
 100                 return ord(k)
 101             try: name2cp(ref)
 102             except KeyError: text = '&%s;' % ref
 103             else: text = unichr(name2cp(ref)).encode('utf-8')
 104         self._chars.append(text)
 105
 106
 107
 108 def _remove_ids_if_duplicates(items):
 109     ids = {}
 110     duplicates = False
 111     for i in items:
 112         if i.guid is not None and i.guid != "":
 113             if ids.has_key(i.guid):
 114                 duplicates = True
 115                 break
 116             ids[i.guid] = True
 117     if duplicates:
 118         for i in items:
 119             i.guid = None
 120             i.guidislink = False
 121     return
 122
 123 def _to_unicode(text, encoding):
 124     if text and not isinstance(text, types.UnicodeType):
 125         text = unicode(text, encoding)
 126     return text
 127
 128 def feedparser_parse(data):
 129     pc = feedparser.parse(data)
 130     enc = pc.get('encoding', helpers.get_locale_encoding())
 131     if not enc:
 132         enc = sys.getdefaultencoding()
 133     return (pc, enc)
 134
 135 def parse_channel_info(parsed, parsed_content, encoding):
 136     parsed.title = _to_unicode(parsed_content.feed.get('title', ''), encoding)
 137     parsed.description = _to_unicode(parsed_content.feed.get('description', ''), encoding)
 138     parsed.link = _to_unicode(parsed_content.feed.get('link', ''), encoding)
 139     parsed.copyright = _to_unicode(parsed_content.feed.get('copyright', ''), encoding)
 140     parsed.last_build_date = parsed_content.feed.get('modified')
 141     parsed.creator = _to_unicode(parsed_content.feed.get('creator', ''), encoding)
 142     return parsed
 143
 144 def parse(content, feed = None, location = None):
 145     parsed_content, encoding = feedparser_parse(content)
 146
 147     if feed == None:
 148         from model import Feed
 149         feed = Feed()
 150         feed.location = location
 151         feed = parse_channel_info(feed, parsed_content, encoding)
 152
 153     for entry in parsed_content.entries:
 154         item = _parse_entry(entry, feed)
 155         feed.add_item(item)
 156
 157     _remove_ids_if_duplicates(feed.items)
 158
 159     return feed
 160
 161 def sanitize_content(data, feed, limit=60):
 162     images = None
 163     title = ""
 164     try:
 165         tp = TitleImgParser(feed)
 166         try:
 167             tp.feed(data)
 168             #images = [image for image in tp.get_image_urls()]
 169             #print tp.get_image_urls()
 170             images = tp.get_image_urls()
 171             title = tp.get_text(limit)
 172         except Exception, ex:
 173             error.log(ex)
 174     finally:
 175         tp.close()
 176     #print images
 177     return (title, images)
 178
 179 def _parse_entry(entry, feed):
 180     from model import Item
 181     item = Item()#SummaryItem.SummaryItem()
 182     item.feed = feed
 183     item.images = []
 184     content = []
 185     description = ""
 186     title = ""#_("No title")
 187
 188     if entry.has_key('content'):
 189         # it can have multiple content, so we just aggregate them for now.
 190         for c in entry.content:
 191             try:
 192                 if c.value not in content:
 193                     content.append(c.value)
 194             except TypeError, te:
 195                 error.log(te)
 196                 pass
 197
 198     if not len(content) and entry.has_key('summary'):
 199         content.append(entry.get('summary', ''))
 200
 201     description = "<br/>".join(content)
 202
 203     title = entry.get('title', '')
 204     if description:
 205         alttitle, item.images = sanitize_content(description, feed)
 206         #import ImageCache
 207         #[ImageCache.cache.add_refer(image, False, item) for image in images]
 208         if not title:
 209             pass
 210             # get the first MAXSPLIT words of the description and make that as our
 211             # title
 212             #dwords = string.splitfields(alttitle, maxsplit=6)
 213             #title = ' '.join(dwords[:]) + ' ...'
 214     title = title.replace('\n', '')
 215     item.title = title
 216
 217     item.description = description
 218     item.guidislink = entry.get('guidislink', False)
 219     item.link = entry.get('link', None)
 220     item.guid = entry.get('guid', None)
 221     item.creator = entry.get('author', None)
 222     item.contributors = entry.get('contributors', None)
 223     item.pub_date = entry.get('modified_parsed', time.strftime("%Y-%m-%d", time.localtime()))#time.localtime())
 224     item.license_urls.append(entry.get('license', None))
 225     item.fm_license = entry.get('fm_license', None)
 226     item.fm_changes = entry.get('fm_changes', None)
 227     item.publication_name = entry.get('prism_publicationname', None)
 228     item.publication_volume = entry.get('prism_volume', None)
 229     item.publication_number = entry.get('prism_number', None)
 230     item.publication_section = entry.get('prism_section', None)
 231     item.publication_starting_page = entry.get('prism_startingpage', None)
 232     item.enclosures = entry.get('enclosures', None)
 233
 234     #print item.pub_date
 235     #print time.strftime("%Y-%m-%d",time.localtime())
 236     item.pub_date = time.strftime("%Y-%m-%d %H:%M", time.localtime())
 237
 238     if entry.has_key('source'):
 239         url = entry.source.get('url', None)
 240         text = entry.source.get('value', None)
 241         if url and text:
 242             item.source = {'url': url,
 243                            'text': text}
 244         else:
 245             # There's no point displaying the source if there's no url in the
 246             # first place. This is a violation of the RSS 0.92 spec
 247             # http://backend.userland.com/rss092.
 248             item.source = None
 249
 250     return item