Work on feed discovery dialog.
[straw/fork.git] / straw / SummaryParser.py
blob 6789884e9e17bb20e345452bb2538af33000d673
1 """ Summaryparser.py
3 Wrapper module to feedparser and responsible for assigning data to Feed and
4 SummaryItems.
5 """
__copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
__license__ = """
Straw is free software; you can redistribute it and/or modify it under the
terms of the GNU General Public License as published by the Free Software
Foundation; either version 2 of the License, or (at your option) any later
version.

Straw is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc., 59 Temple
Place - Suite 330, Boston, MA 02111-1307, USA. """

from straw import helpers
import HTMLParser
import SummaryItem
import copy
import error
import feedparser
import htmlentitydefs
import string
import sys
import time
import types

class TitleImgParser(HTMLParser.HTMLParser):
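    """Extract plain text and image URLs from an item's HTML content.

    Text is collected through handle_data/handle_charref/handle_entityref,
    and the 'src' of each img tag is completed against the feed's link.
    """
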
    def __init__(self, feed=None):
        HTMLParser.HTMLParser.__init__(self)
        self._chars = []
        self._image_urls = []
        self._feed = feed

    def set_feed(self, feed):
        self._feed = feed

    def get_image_urls(self):
        return self._image_urls

    def get_text(self, nchars=None):
        text = ''.join(self._chars).strip()
        if nchars:
            text = text[:nchars]
        return text

    def close(self):
        self.flush()
        HTMLParser.HTMLParser.close(self)

    def flush(self):
        del self._chars[:]
        #del self._image_urls[:]

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            for name, value in attrs:
                if name == 'src':
                    url = helpers.complete_url(value, self._feed.link)
                    self._image_urls.append(url)
                    return

    def handle_data(self, data):
        self._chars.append(data)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self._chars:
            return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self._chars.append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self._chars:
            return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try:
                text = unichr(name2cp(ref)).encode('utf-8')
            except KeyError:
                text = '&%s;' % ref
        self._chars.append(text)

def _remove_ids_if_duplicates(items):
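    """Clear every item's guid when any non-empty guid occurs more than once.

    A feed that reuses guids cannot be trusted to identify its items by
    them, so the guids (and guidislink flags) are dropped entirely.
    """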
    ids = {}
    duplicates = False
    for i in items:
        if i.guid is not None and i.guid != "":
            if i.guid in ids:
                duplicates = True
                break
            ids[i.guid] = True
    if duplicates:
        for i in items:
            i.guid = None
            i.guidislink = False

def _to_unicode(text, encoding):
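    """Decode text with the given encoding unless it is empty or already unicode."""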
    if text and not isinstance(text, types.UnicodeType):
        text = unicode(text, encoding)
    return text

def feedparser_parse(data):
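    """Run feedparser over the raw feed data.

    Returns a (parsed_content, encoding) pair; when feedparser does not
    report an encoding, fall back to the locale encoding and finally to
    Python's default encoding.
    """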
    pc = feedparser.parse(data)
    enc = pc.get('encoding', helpers.get_locale_encoding())
    if not enc:
        enc = sys.getdefaultencoding()
    return (pc, enc)

def parse_channel_info(parsed, parsed_content, encoding):
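    """Copy the channel-level fields (title, description, link, ...) from
    the feedparser result onto 'parsed', decoding each to unicode."""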
    parsed.title = _to_unicode(parsed_content.feed.get('title', ''), encoding)
    parsed.description = _to_unicode(parsed_content.feed.get('description', ''), encoding)
    parsed.link = _to_unicode(parsed_content.feed.get('link', ''), encoding)
    parsed.copyright = _to_unicode(parsed_content.feed.get('copyright', ''), encoding)
    parsed.last_build_date = parsed_content.feed.get('modified')
    parsed.creator = _to_unicode(parsed_content.feed.get('creator', ''), encoding)
    return parsed

def parse(content, feed=None, location=None):
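    """Parse raw feed content into a Feed.

    Creates a new Feed at 'location' when none is given, fills in the
    channel information and adds one item per parsed entry.
    """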
    parsed_content, encoding = feedparser_parse(content)

    if feed is None:
        from model import Feed
        feed = Feed()
        feed.location = location

    feed = parse_channel_info(feed, parsed_content, encoding)

    for entry in parsed_content.entries:
        item = _parse_entry(entry, feed)
        feed.add_item(item)

    _remove_ids_if_duplicates(feed.items)

    return feed

def sanitize_content(data, feed, limit=60):
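    """Run HTML content through TitleImgParser.

    Returns a (title, images) pair: at most 'limit' characters of the
    plain text, plus the list of image URLs found in the markup.
    """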
    images = None
    title = ""
    try:
        tp = TitleImgParser(feed)
        try:
            tp.feed(data)
            images = tp.get_image_urls()
            title = tp.get_text(limit)
        except Exception, ex:
            error.log(ex)
    finally:
        tp.close()
    return (title, images)

def _parse_entry(entry, feed):
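    """Map a single feedparser entry onto an Item belonging to 'feed'."""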
    from model import Item
    item = Item()
    item.feed = feed
    item.images = []
    content = []
    description = ""
    title = "" #_("No title")

    if entry.has_key('content'):
        # an entry can have multiple content elements; just aggregate them for now
        for c in entry.content:
            try:
                if c.value not in content:
                    content.append(c.value)
            except TypeError, te:
                error.log(te)

    if not len(content) and entry.has_key('summary'):
        content.append(entry.get('summary', ''))

    description = "<br/>".join(content)

    title = entry.get('title', '')
    if description:
        alttitle, item.images = sanitize_content(description, feed)
        if not title:
            pass
            # get the first MAXSPLIT words of the description and make
            # that our title
            #dwords = string.splitfields(alttitle, maxsplit=6)
            #title = ' '.join(dwords[:]) + ' ...'
    title = title.replace('\n', '')
    item.title = title

    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    # use the entry's modification time when the feed provides one,
    # otherwise fall back to the current local time
    modified = entry.get('modified_parsed')
    if modified:
        item.pub_date = time.strftime("%Y-%m-%d %H:%M", modified)
    else:
        item.pub_date = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    # only record a license when the entry actually carries one
    license_url = entry.get('license', None)
    if license_url:
        item.license_urls.append(license_url)
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section', None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)

    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url,
                           'text': text}
        else:
            # There's no point displaying the source if there's no url in the
            # first place. This is a violation of the RSS 0.92 spec
            # http://backend.userland.com/rss092.
            item.source = None

    return item
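
if __name__ == '__main__':
    # Minimal usage sketch, not part of Straw proper: parse a feed document
    # given on the command line and print what was extracted. Assumes the
    # straw 'model' module (Feed/Item) is importable, as parse() requires.
    data = open(sys.argv[1]).read()
    feed = parse(data, location=sys.argv[1])
    print feed.title
    for item in feed.items:
        print '-', item.title, item.link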