Work on feed discovery assistant.
[straw/fork.git] / straw / SummaryParser.py
blobd0cdffa0fe8763d5e434dc108e6d1ec540d83e2f
1 """ Summaryparser.py
3 Wrapper module to feedparser and responsible for assigning data to Feed and
4 SummaryItems.
5 """
6 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
22 from straw import helpers
23 import HTMLParser
24 import SummaryItem
25 import copy
26 import error
27 import feedparser
28 import htmlentitydefs
29 import string
30 import sys
31 import time
32 import types
class TitleImgParser(HTMLParser.HTMLParser):
    """HTML parser that accumulates the text content of a summary (for
    use as a fallback title) and collects the URLs of any inline images,
    resolved against the owning feed's location.
    """

    def __init__(self, feed=None):
        HTMLParser.HTMLParser.__init__(self)
        self._chars = []       # accumulated text fragments
        self._image_urls = []  # absolute URLs of <img src=...> seen so far
        self._feed = feed      # feed whose location resolves relative URLs

    def set_feed(self, feed):
        self._feed = feed

    def get_image_urls(self):
        return self._image_urls

    def get_text(self, nchars=None):
        # Join the buffered fragments; optionally truncate to nchars.
        text = ''.join(self._chars).strip()
        if nchars:
            text = text[:nchars]
        return text

    def close(self):
        self.flush()
        HTMLParser.HTMLParser.close(self)

    def flush(self):
        # Discard buffered text. Image URLs are deliberately kept so they
        # remain available after close() -- see the commented-out line.
        del self._chars[:]
        #del self._image_urls[:]

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            for name, value in attrs:
                if name == 'src':
                    # Resolve possibly-relative src against the feed URL.
                    url = helpers.complete_url(value, self._feed.location)
                    self._image_urls.append(url)
                    return

    def handle_data(self, data):
        self._chars.append(data)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        # NOTE(review): references arriving before any text are dropped --
        # presumably to keep titles from starting with markup; confirm.
        if not self._chars: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            # Markup-significant characters (quotes, &, ', <, >) are kept
            # escaped so raw markup is not re-introduced into the text.
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)  # hexadecimal character reference
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self._chars.append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self._chars: return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            # Keep markup-significant entities escaped.
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                # Map an entity name to its Unicode code point.
                import htmlentitydefs
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            # Unknown entities are passed through escaped rather than raising.
            try: name2cp(ref)
            except KeyError: text = '&%s;' % ref
            else: text = unichr(name2cp(ref)).encode('utf-8')
        self._chars.append(text)
108 def _remove_ids_if_duplicates(items):
109 ids = {}
110 duplicates = False
111 for i in items:
112 if i.guid is not None and i.guid != "":
113 if ids.has_key(i.guid):
114 duplicates = True
115 break
116 ids[i.guid] = True
117 if duplicates:
118 for i in items:
119 i.guid = None
120 i.guidislink = False
121 return
123 def _to_unicode(text, encoding):
124 if text and not isinstance(text, types.UnicodeType):
125 text = unicode(text, encoding)
126 return text
def feedparser_parse(data):
    """Run feedparser over raw feed data.

    Returns a (parsed_content, encoding) tuple.  The encoding is the one
    feedparser detected, falling back first to the locale encoding and
    finally to the interpreter's default encoding.
    """
    parsed_content = feedparser.parse(data)
    encoding = (parsed_content.get('encoding', helpers.get_locale_encoding())
                or sys.getdefaultencoding())
    return (parsed_content, encoding)
def parse_channel_info(parsed, parsed_content, encoding):
    """Copy the channel-level fields of a feedparser result onto `parsed`.

    Text fields are decoded with `encoding`; the modified date is copied
    through untouched.  Returns `parsed`.
    """
    channel = parsed_content.feed
    # (destination attribute, feedparser key) pairs for the text fields.
    text_fields = (('title', 'title'),
                   ('description', 'description'),
                   ('location', 'link'),
                   ('copyright', 'copyright'),
                   ('creator', 'creator'))
    for attr, key in text_fields:
        setattr(parsed, attr, _to_unicode(channel.get(key, ''), encoding))
    parsed.last_build_date = channel.get('modified')
    return parsed
def parse(content, feed = None):
    """Parse raw feed data into a Feed model object.

    When `feed` is None a fresh Feed is created, otherwise the given one
    is updated in place.  Channel info is filled in, one item is added per
    entry, and guids are invalidated if the feed reuses them.  Returns the
    feed.
    """
    parsed_content, encoding = feedparser_parse(content)
    if feed == None:
        from model import Feed
        feed = Feed()
    feed = parse_channel_info(feed, parsed_content, encoding)
    for entry in parsed_content.entries:
        feed.add_item(_parse_entry(entry, feed))
    _remove_ids_if_duplicates(feed.items)
    return feed
160 def sanitize_content(data, feed, limit=60):
161 images = None
162 title = ""
163 try:
164 tp = TitleImgParser(feed)
165 try:
166 tp.feed(data)
167 #images = [image for image in tp.get_image_urls()]
168 #print tp.get_image_urls()
169 images = tp.get_image_urls()
170 title = tp.get_text(limit)
171 except Exception, ex:
172 error.log(ex)
173 finally:
174 tp.close()
175 #print images
176 return (title, images)
def _parse_entry(entry, feed):
    """Build an Item model object from a single feedparser entry.

    Aggregates the entry's content blocks into one description, extracts
    image URLs from it, and copies the assorted optional metadata fields
    (guid, author, PRISM, licensing, enclosures, source) across.
    """
    from model import Item
    item = Item()#SummaryItem.SummaryItem()
    item.feed = feed
    item.images = []
    content = []
    description = ""
    title = ""#_("No title")

    if entry.has_key('content'):
        # it can have multiple content, so we just aggregate them for now.
        for c in entry.content:
            try:
                if c.value not in content:
                    content.append(c.value)
            except TypeError, te:
                error.log(te)
                pass

    # Fall back to the summary element when no content blocks were found.
    if not len(content) and entry.has_key('summary'):
        content.append(entry.get('summary', ''))

    description = "<br/>".join(content)

    title = entry.get('title', '')
    if description:
        # Collect image URLs; the text-derived alternate title is unused
        # while the fallback-title code below stays commented out.
        alttitle, item.images = sanitize_content(description, feed)
        #import ImageCache
        #[ImageCache.cache.add_refer(image, False, item) for image in images]
    if not title:
        pass
        # get the first MAXSPLIT words of the description and make that as our
        # title
        #dwords = string.splitfields(alttitle, maxsplit=6)
        #title = ' '.join(dwords[:]) + ' ...'
    title = title.replace('\n', '')
    item.title = title

    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    item.pub_date = entry.get('modified_parsed', time.strftime("%Y-%m-%d", time.localtime()))#time.localtime())
    item.license_urls.append(entry.get('license', None))
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section', None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)

    #print item.pub_date
    #print time.strftime("%Y-%m-%d",time.localtime())
    # NOTE(review): this unconditionally overwrites the entry's own
    # modified date (assigned above) with the current local time, making
    # the earlier pub_date assignment dead -- confirm whether intentional.
    item.pub_date = time.strftime("%Y-%m-%d %H:%M", time.localtime())

    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url,
                           'text': text}
        else:
            # There's no point displaying the source if there's no url in the
            # first place. This is a violation of the RSS 0.92 spec
            # http://backend.userland.com/rss092.
            item.source = None

    return item