Implemented "mark all as read".
[straw.git] / straw / SummaryParser.py
blob e889c3a0deac301f214dea83d74099c247b814c7
1 """ Summaryparser.py
3 Wrapper module to feedparser and responsible for assigning data to Feed and
4 Items.
5 """
6 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
from model import Item
from straw import helpers
import HTMLParser
import error
import feedparser
import htmlentitydefs
import sys
import time
import types

class TitleImgParser(HTMLParser.HTMLParser):
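    """Extract plain text and image URLs from an item's HTML content.

    Feed HTML through feed(); get_text() then returns the accumulated
    character data (optionally truncated) and get_image_urls() returns the
    src attributes of any <img> tags, completed against the owning feed's
    link.
    """
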
    def __init__(self, feed=None):
        HTMLParser.HTMLParser.__init__(self)
        self._chars = []
        self._image_urls = []
        self._feed = feed

    def set_feed(self, feed):
        self._feed = feed

    def get_image_urls(self):
        return self._image_urls

    def get_text(self, nchars=None):
        text = ''.join(self._chars).strip()
        if nchars:
            text = text[:nchars]
        return text

    def close(self):
        self.flush()
        HTMLParser.HTMLParser.close(self)

    def flush(self):
        del self._chars[:]
        #del self._image_urls[:]

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            for name, value in attrs:
                if name == 'src':
                    url = helpers.complete_url(value, self._feed.link)
                    self._image_urls.append(url)
                    return

    def handle_data(self, data):
        self._chars.append(data)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self._chars:
            return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self._chars.append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self._chars:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try:
                cp = name2cp(ref)
            except KeyError:
                text = '&%s;' % ref
            else:
                text = unichr(cp).encode('utf-8')
        self._chars.append(text)

def _remove_ids_if_duplicates(items):
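    """Clear the guid of every item if any guid occurs more than once.

    A duplicated guid is useless for telling items apart, so when one is
    found all guids (and guidislink flags) in the batch are reset.
    """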
    ids = {}
    duplicates = False
    for i in items:
        if i.guid is not None and i.guid != "":
            if i.guid in ids:
                duplicates = True
                break
            ids[i.guid] = True
    if duplicates:
        for i in items:
            i.guid = None
            i.guidislink = False

def _to_unicode(text, encoding):
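    """Decode a byte string with the given encoding; unicode and empty
    values pass through unchanged."""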
    if text and not isinstance(text, types.UnicodeType):
        text = unicode(text, encoding)
    return text

def feedparser_parse(data):
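    """Run feedparser over raw feed data.

    Returns a (parsed_content, encoding) tuple; the encoding falls back
    from the one feedparser detected to the locale encoding and finally to
    Python's default encoding, so callers always have something to decode
    with.
    """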
    pc = feedparser.parse(data)
    enc = pc.get('encoding', helpers.get_locale_encoding())
    if not enc:
        enc = sys.getdefaultencoding()
    return (pc, enc)

def parse_channel_info(feed, parsed_content, encoding):
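    """Copy channel-level data (title, description, link, copyright,
    modification date) onto the Feed, decoding to unicode; less common
    fields such as creator, generator, category, ttl and publisher are
    stashed in feed.pdict.
    """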
    feed.title = _to_unicode(parsed_content.feed.get('title', ''), encoding)
    feed.description = _to_unicode(parsed_content.feed.get('description', ''), encoding)
    feed.link = _to_unicode(parsed_content.feed.get('link', ''), encoding)
    feed.copyright = _to_unicode(parsed_content.feed.get('copyright', ''), encoding)
    feed.last_build_date = parsed_content.feed.get('modified')
    feed.pdict["creator"] = _to_unicode(parsed_content.feed.get('creator', None), encoding)
    feed.pdict["generator"] = _to_unicode(parsed_content.feed.get('generator', None), encoding)
    feed.pdict["category"] = _to_unicode(parsed_content.feed.get('category', None), encoding)
    feed.pdict["ttl"] = _to_unicode(parsed_content.feed.get('ttl', None), encoding)
    feed.pdict["webmaster"] = _to_unicode(parsed_content.feed.get('publisher', None), encoding)
    return feed

def parse(content, feed=None, location=None):
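    """Parse raw feed data into a Feed populated with Items.

    If no Feed is given, a new one is created with the given location.
    A minimal usage sketch (the location URL is hypothetical):

        data = open('/tmp/feed.xml').read()
        feed = parse(data, location='http://example.org/feed.xml')
        for item in feed.items:
            print item.title
    """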
    parsed_content, encoding = feedparser_parse(content)

    if not feed:
        from model import Feed
        feed = Feed()
        feed.location = location

    parse_channel_info(feed, parsed_content, encoding)

    for entry in parsed_content.entries:
        item = _parse_entry(entry, feed)
        feed.add_item(item)

    _remove_ids_if_duplicates(feed.items)
    return feed

def sanitize_content(data, feed, limit=60):
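    """Reduce an item's HTML to at most `limit` characters of plain text
    plus the list of image URLs found in it; parse errors are logged
    rather than raised.
    """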
    images = None
    title = ""
    # construct the parser outside the try block so tp is always bound
    # when the finally clause runs
    tp = TitleImgParser(feed)
    try:
        try:
            tp.feed(data)
            images = tp.get_image_urls()
            title = tp.get_text(limit)
        except Exception, ex:
            error.log(ex)
    finally:
        tp.close()
    return (title, images)

def _parse_entry(entry, feed):
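    """Build an Item from a single feedparser entry: aggregate its content
    blocks into one description, derive a title (falling back to the start
    of the description), and copy the RSS/Dublin Core/PRISM metadata that
    Straw tracks.
    """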
    item = Item()
    item.feed = feed
    item.images = []
    content = []
    description = ""
    title = ""

    if entry.has_key('content'):
        # an entry can have multiple content blocks, so we just aggregate
        # them for now.
        for c in entry.content:
            try:
                if c.value not in content:
                    content.append(c.value)
            except TypeError, te:
                error.log(te)

    if not len(content) and entry.has_key('summary'):
        content.append(entry.get('summary', ''))

    description = "<br/>".join(content)

    title = entry.get('title', '')
    if description:
        alttitle, item.images = sanitize_content(description, feed)
        if not title:
            # no title in the entry; use the first few words of the
            # description instead
            title = ' '.join(alttitle.split()[:6]) + ' ...'
    title = title.replace('\n', '')
    item.title = title

    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    # only record a license URL when the entry actually carries one
    license_url = entry.get('license', None)
    if license_url:
        item.license_urls.append(license_url)
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section', None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)

    date_tuple = entry.get('updated_parsed', time.localtime())
    if date_tuple and len(date_tuple) == 9:
        item.pub_date = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple)

    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url,
                           'text': text}
        else:
            # There's no point displaying the source if there's no url in
            # the first place; that would be a violation of the RSS 0.92
            # spec (http://backend.userland.com/rss092).
            item.source = None

    return item
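
# A minimal smoke test, assuming the module's imports (model, straw.helpers,
# error) resolve when run standalone; the sample feed below is made up for
# illustration.
if __name__ == '__main__':
    SAMPLE = """<?xml version="1.0"?>
<rss version="2.0"><channel>
<title>Example</title><link>http://example.org/</link>
<item><title>Hello</title>
<description>&lt;p&gt;Hello &lt;img src="/logo.png"/&gt; world&lt;/p&gt;</description></item>
</channel></rss>"""
    f = parse(SAMPLE, location='http://example.org/feed.xml')
    print f.title
    for it in f.items:
        print '-', it.title, it.images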