Implemented "mark all as read".
[straw.git] / straw / SummaryParser.py
blob e889c3a0deac301f214dea83d74099c247b814c7
1 """ Summaryparser.py
3 Wrapper module to feedparser and responsible for assigning data to Feed and
4 Items.
5 """
6 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
from model import Item
from straw import helpers
import HTMLParser
import error
import feedparser
import htmlentitydefs
import sys
import time
import types

class TitleImgParser(HTMLParser.HTMLParser):
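    """Extract plain text and image URLs from an item's HTML content.

    Feed HTML through feed(); get_text() then returns the accumulated
    character data (optionally truncated) and get_image_urls() returns the
    src attributes of any <img> tags, completed against the owning feed's
    link.
    """
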
    def __init__(self, feed=None):
        HTMLParser.HTMLParser.__init__(self)
        self._chars = []
        self._image_urls = []
        self._feed = feed

    def set_feed(self, feed):
        self._feed = feed

    def get_image_urls(self):
        return self._image_urls

    def get_text(self, nchars=None):
        text = ''.join(self._chars).strip()
        if nchars:
            text = text[:nchars]
        return text

    def close(self):
        self.flush()
        HTMLParser.HTMLParser.close(self)

    def flush(self):
        del self._chars[:]
        #del self._image_urls[:]

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            for name, value in attrs:
                if name == 'src':
                    url = helpers.complete_url(value, self._feed.link)
                    self._image_urls.append(url)
                    return

    def handle_data(self, data):
        self._chars.append(data)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self._chars:
            return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self._chars.append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self._chars:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try:
                cp = name2cp(ref)
            except KeyError:
                text = '&%s;' % ref
            else:
                text = unichr(cp).encode('utf-8')
        self._chars.append(text)

def _remove_ids_if_duplicates(items):
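    """Clear the guid of every item if any guid occurs more than once.

    A duplicated guid is useless for telling items apart, so when one is
    found all guids (and guidislink flags) in the batch are reset.
    """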
    ids = {}
    duplicates = False
    for i in items:
        if i.guid is not None and i.guid != "":
            if i.guid in ids:
                duplicates = True
                break
            ids[i.guid] = True
    if duplicates:
        for i in items:
            i.guid = None
            i.guidislink = False

def _to_unicode(text, encoding):
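    """Decode a byte string with the given encoding; unicode and empty
    values pass through unchanged."""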
    if text and not isinstance(text, types.UnicodeType):
        text = unicode(text, encoding)
    return text

def feedparser_parse(data):
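    """Run feedparser over raw feed data.

    Returns a (parsed_content, encoding) tuple; the encoding falls back
    from the one feedparser detected to the locale encoding and finally to
    Python's default encoding, so callers always have something to decode
    with.
    """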
    pc = feedparser.parse(data)
    enc = pc.get('encoding', helpers.get_locale_encoding())
    if not enc:
        enc = sys.getdefaultencoding()
    return (pc, enc)

def parse_channel_info(feed, parsed_content, encoding):
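    """Copy channel-level data (title, description, link, copyright,
    modification date) onto the Feed, decoding to unicode; less common
    fields such as creator, generator, category, ttl and publisher are
    stashed in feed.pdict.
    """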
    feed.title = _to_unicode(parsed_content.feed.get('title', ''), encoding)
    feed.description = _to_unicode(parsed_content.feed.get('description', ''), encoding)
    feed.link = _to_unicode(parsed_content.feed.get('link', ''), encoding)
    feed.copyright = _to_unicode(parsed_content.feed.get('copyright', ''), encoding)
    feed.last_build_date = parsed_content.feed.get('modified')
    feed.pdict["creator"] = _to_unicode(parsed_content.feed.get('creator', None), encoding)
    feed.pdict["generator"] = _to_unicode(parsed_content.feed.get('generator', None), encoding)
    feed.pdict["category"] = _to_unicode(parsed_content.feed.get('category', None), encoding)
    feed.pdict["ttl"] = _to_unicode(parsed_content.feed.get('ttl', None), encoding)
    feed.pdict["webmaster"] = _to_unicode(parsed_content.feed.get('publisher', None), encoding)
    return feed

def parse(content, feed=None, location=None):
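    """Parse raw feed data into a Feed populated with Items.

    If no Feed is given, a new one is created with the given location.
    A minimal usage sketch (the location URL is hypothetical):

        data = open('/tmp/feed.xml').read()
        feed = parse(data, location='http://example.org/feed.xml')
        for item in feed.items:
            print item.title
    """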
    parsed_content, encoding = feedparser_parse(content)

    if not feed:
        from model import Feed
        feed = Feed()
        feed.location = location

    parse_channel_info(feed, parsed_content, encoding)

    for entry in parsed_content.entries:
        item = _parse_entry(entry, feed)
        feed.add_item(item)

    _remove_ids_if_duplicates(feed.items)
    return feed

def sanitize_content(data, feed, limit=60):
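    """Reduce an item's HTML to at most `limit` characters of plain text
    plus the list of image URLs found in it; parse errors are logged
    rather than raised.
    """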
    images = None
    title = ""
    # construct the parser outside the try block so tp is always bound
    # when the finally clause runs
    tp = TitleImgParser(feed)
    try:
        try:
            tp.feed(data)
            images = tp.get_image_urls()
            title = tp.get_text(limit)
        except Exception, ex:
            error.log(ex)
    finally:
        tp.close()
    return (title, images)

def _parse_entry(entry, feed):
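    """Build an Item from a single feedparser entry: aggregate its content
    blocks into one description, derive a title (falling back to the start
    of the description), and copy the RSS/Dublin Core/PRISM metadata that
    Straw tracks.
    """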
    item = Item()
    item.feed = feed
    item.images = []
    content = []
    description = ""
    title = ""

    if entry.has_key('content'):
        # an entry can have multiple content blocks, so we just aggregate
        # them for now.
        for c in entry.content:
            try:
                if c.value not in content:
                    content.append(c.value)
            except TypeError, te:
                error.log(te)

    if not len(content) and entry.has_key('summary'):
        content.append(entry.get('summary', ''))

    description = "<br/>".join(content)

    title = entry.get('title', '')
    if description:
        alttitle, item.images = sanitize_content(description, feed)
        if not title:
            # no title in the entry; use the first few words of the
            # description instead
            title = ' '.join(alttitle.split()[:6]) + ' ...'
    title = title.replace('\n', '')
    item.title = title

    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    # only record a license URL when the entry actually carries one
    license_url = entry.get('license', None)
    if license_url:
        item.license_urls.append(license_url)
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section', None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)

    date_tuple = entry.get('updated_parsed', time.localtime())
    if date_tuple and len(date_tuple) == 9:
        item.pub_date = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple)

    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url,
                           'text': text}
        else:
            # There's no point displaying the source if there's no url in
            # the first place; that would be a violation of the RSS 0.92
            # spec (http://backend.userland.com/rss092).
            item.source = None

    return item
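
# A minimal smoke test, assuming the module's imports (model, straw.helpers,
# error) resolve when run standalone; the sample feed below is made up for
# illustration.
if __name__ == '__main__':
    SAMPLE = """<?xml version="1.0"?>
<rss version="2.0"><channel>
<title>Example</title><link>http://example.org/</link>
<item><title>Hello</title>
<description>&lt;p&gt;Hello &lt;img src="/logo.png"/&gt; world&lt;/p&gt;</description></item>
</channel></rss>"""
    f = parse(SAMPLE, location='http://example.org/feed.xml')
    print f.title
    for it in f.items:
        print '-', it.title, it.images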