Fixed new feed items date handling, added date column to the item list, removed Summa...
[straw.git] / straw / SummaryParser.py
1 """ Summaryparser.py
3 Wrapper module to feedparser and responsible for assigning data to Feed and
4 Items.
5 """
6 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """

from model import Item
from straw import helpers
import HTMLParser
import error
import feedparser
import htmlentitydefs
import sys
import time
import types

class TitleImgParser(HTMLParser.HTMLParser):
    def __init__(self, feed=None):
        HTMLParser.HTMLParser.__init__(self)
        self._chars = []
        self._image_urls = []
        self._feed = feed

    def set_feed(self, feed):
        self._feed = feed

    def get_image_urls(self):
        return self._image_urls

    def get_text(self, nchars=None):
        text = ''.join(self._chars).strip()
        if nchars:
            text = text[:nchars]
        return text

    def close(self):
        self.flush()
        HTMLParser.HTMLParser.close(self)

    def flush(self):
        del self._chars[:]
        #del self._image_urls[:]

    def handle_starttag(self, tag, attrs):
        if tag == 'img':
            for name, value in attrs:
                if name == 'src':
                    url = helpers.complete_url(value, self._feed.link)
                    self._image_urls.append(url)
                    return

    def handle_data(self, data):
        self._chars.append(data)

    def handle_charref(self, ref):
        # called for each character reference, e.g. for '&#160;', ref will be '160'
        if not self._chars: return
        ref = ref.lower()
        if ref in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
            text = '&#%s;' % ref
        else:
            if ref[0] == 'x':
                c = int(ref[1:], 16)
            else:
                c = int(ref)
            text = unichr(c).encode('utf-8')
        self._chars.append(text)

    def handle_entityref(self, ref):
        # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
        if not self._chars: return
        if ref in ('lt', 'gt', 'quot', 'amp', 'apos'):
            text = '&%s;' % ref
        else:
            # entity resolution graciously donated by Aaron Swartz
            def name2cp(k):
                if hasattr(htmlentitydefs, 'name2codepoint'): # requires Python 2.3
                    return htmlentitydefs.name2codepoint[k]
                k = htmlentitydefs.entitydefs[k]
                if k.startswith('&#') and k.endswith(';'):
                    return int(k[2:-1]) # not in latin-1
                return ord(k)
            try:
                text = unichr(name2cp(ref)).encode('utf-8')
            except KeyError:
                text = '&%s;' % ref
        self._chars.append(text)
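
# A minimal usage sketch of TitleImgParser, kept as a comment so the module
# itself is unchanged. The markup is made up, and it assumes
# helpers.complete_url() resolves a relative src against feed.link
# (here taken to be 'http://example.com/'):
#
#   tp = TitleImgParser(feed)
#   tp.feed('<p>Hello <img src="/logo.png"> world</p>')
#   tp.get_text(20)      # -> 'Hello  world'
#   tp.get_image_urls()  # -> ['http://example.com/logo.png']
#   tp.close()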

def _remove_ids_if_duplicates(items):
    """If two items in a feed share a guid, the guids are unreliable;
    drop them (and guidislink) for every item."""
    ids = {}
    duplicates = False
    for i in items:
        if i.guid is not None and i.guid != "":
            if ids.has_key(i.guid):
                duplicates = True
                break
            ids[i.guid] = True
    if duplicates:
        for i in items:
            i.guid = None
            i.guidislink = False
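
# Illustration (hypothetical items): if two items both carry guid 'x',
# _remove_ids_if_duplicates() nulls the guid of *every* item in the list
# and clears guidislink; with unique guids the list is left untouched.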

def _to_unicode(text, encoding):
    if text and not isinstance(text, types.UnicodeType):
        text = unicode(text, encoding)
    return text

def feedparser_parse(data):
    pc = feedparser.parse(data)
    enc = pc.get('encoding', helpers.get_locale_encoding())
    if not enc:
        enc = sys.getdefaultencoding()
    return (pc, enc)
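
# Encoding fallback sketch: feedparser's detected encoding wins, then the
# locale encoding from helpers.get_locale_encoding(), then Python's
# default encoding. For example (xml_bytes is hypothetical):
#
#   pc, enc = feedparser_parse(xml_bytes)
#   pc.feed.get('title'), enc   # -> (u'Example Feed', 'utf-8')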

def parse_channel_info(parsed, parsed_content, encoding):
    parsed.title = _to_unicode(parsed_content.feed.get('title', ''), encoding)
    parsed.description = _to_unicode(parsed_content.feed.get('description', ''), encoding)
    parsed.link = _to_unicode(parsed_content.feed.get('link', ''), encoding)
    parsed.copyright = _to_unicode(parsed_content.feed.get('copyright', ''), encoding)
    parsed.last_build_date = parsed_content.feed.get('modified')
    parsed.creator = _to_unicode(parsed_content.feed.get('creator', ''), encoding)
    return parsed

def parse(content, feed=None, location=None):
    parsed_content, encoding = feedparser_parse(content)

    if not feed:
        from model import Feed
        feed = Feed()
        feed.location = location
    feed = parse_channel_info(feed, parsed_content, encoding)

    for entry in parsed_content.entries:
        item = _parse_entry(entry, feed)
        feed.add_item(item)

    _remove_ids_if_duplicates(feed.items)

    return feed
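
# A hedged usage sketch; the URL and the urllib fetch are hypothetical,
# since this module only parses content it is handed:
#
#   import urllib
#   data = urllib.urlopen('http://example.com/feed.xml').read()
#   feed = parse(data, location='http://example.com/feed.xml')
#   for item in feed.items:
#       print item.title, item.pub_date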

def sanitize_content(data, feed, limit=60):
    images = None
    title = ""
    tp = TitleImgParser(feed)
    try:
        try:
            tp.feed(data)
            images = tp.get_image_urls()
            title = tp.get_text(limit)
        except Exception, ex:
            error.log(ex)
    finally:
        tp.close()
    return (title, images)
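
# For example (markup invented for illustration), assuming feed.link is
# 'http://example.com/':
#
#   sanitize_content('<b>Breaking</b> news <img src="a.png">', feed)
#   # -> ('Breaking news', ['http://example.com/a.png'])
#
# with the returned title clipped to at most `limit` characters.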

def _parse_entry(entry, feed):
    item = Item()
    item.feed = feed
    item.images = []
    content = []
    description = ""
    title = ""  # _("No title")

    if entry.has_key('content'):
        # an entry can have several content elements; aggregate them for now.
        for c in entry.content:
            try:
                if c.value not in content:
                    content.append(c.value)
            except TypeError, te:
                error.log(te)

    if not len(content) and entry.has_key('summary'):
        content.append(entry.get('summary', ''))

    description = "<br/>".join(content)

    title = entry.get('title', '')
    if description:
        alttitle, item.images = sanitize_content(description, feed)
        #import ImageCache
        #[ImageCache.cache.add_refer(image, False, item) for image in images]
    if not title:
        pass
        # get the first MAXSPLIT words of the description and use them as the
        # title
        #dwords = string.splitfields(alttitle, maxsplit=6)
        #title = ' '.join(dwords[:]) + ' ...'
    title = title.replace('\n', '')
    item.title = title

    item.description = description
    item.guidislink = entry.get('guidislink', False)
    item.link = entry.get('link', None)
    item.guid = entry.get('guid', None)
    item.creator = entry.get('author', None)
    item.contributors = entry.get('contributors', None)
    item.license_urls.append(entry.get('license', None))
    item.fm_license = entry.get('fm_license', None)
    item.fm_changes = entry.get('fm_changes', None)
    item.publication_name = entry.get('prism_publicationname', None)
    item.publication_volume = entry.get('prism_volume', None)
    item.publication_number = entry.get('prism_number', None)
    item.publication_section = entry.get('prism_section', None)
    item.publication_starting_page = entry.get('prism_startingpage', None)
    item.enclosures = entry.get('enclosures', None)

    # feedparser can return 'updated_parsed' as None for unparseable dates,
    # so fall back to the current local time in that case too.
    date_tuple = entry.get('updated_parsed') or time.localtime()
    item.pub_date = time.strftime("%Y-%m-%d %H:%M:%S", date_tuple)

    if entry.has_key('source'):
        url = entry.source.get('url', None)
        text = entry.source.get('value', None)
        if url and text:
            item.source = {'url': url, 'text': text}
        else:
            # There's no point in displaying the source if there's no url in
            # the first place, even though that is a violation of the RSS
            # 0.92 spec, http://backend.userland.com/rss092.
            item.source = None

    return item
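
# Rough shape of the Item built above (values hypothetical):
#
#   item.title        # u'Example entry', newlines stripped
#   item.description  # content elements joined with '<br/>', or the summary
#   item.pub_date     # e.g. '2005-06-01 13:45:00'; local time when the
#                     # entry carries no parseable date
#   item.source       # {'url': ..., 'text': ...} or None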