3 Wrapper module to feedparser and responsible for assigning data to Feed and
6 __copyright__
= "Copyright (c) 2002-2005 Free Software Foundation, Inc."
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
21 from model
import Item
22 from straw
import helpers
31 class TitleImgParser(HTMLParser
.HTMLParser
):
32 def __init__(self
, feed
=None):
33 HTMLParser
.HTMLParser
.__init
__(self
)
38 def set_feed(self
, feed
):
def get_image_urls(self):
    """Return the image URLs gathered so far.

    The very list that ``handle_starttag`` appends to is handed back
    (not a copy), so later parsing or caller-side mutation is visible
    through the returned object.
    """
    image_urls = self._image_urls
    return image_urls
44 def get_text(self
, nchars
=None):
45 text
= ''.join(self
._chars
).strip()
52 HTMLParser
.HTMLParser
.close(self
)
56 #del self._image_urls[:]
58 def handle_starttag(self
, tag
, attrs
):
60 for name
, value
in attrs
:
62 url
= helpers
.complete_url(value
, self
._feed
.link
)
63 self
._image
_urls
.append(url
)
def handle_data(self, data):
    """Store one chunk of character data for later assembly.

    Chunks are kept in arrival order in ``self._chars``; ``get_text``
    joins them into the final text.
    """
    collected = self._chars
    collected.append(data)
69 def handle_charref(self
, ref
):
70 # called for each character reference, e.g. for '&#160;', ref will be '160'
71 if not self
._chars
: return
73 if ref
in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
80 text
= unichr(c
).encode('utf-8')
81 self
._chars
.append(text
)
83 def handle_entityref(self
, ref
):
84 # called for each entity reference, e.g. for '&copy;', ref will be 'copy'
85 if not self
._chars
: return
86 if ref
in ('lt', 'gt', 'quot', 'amp', 'apos'):
89 # entity resolution graciously donated by Aaron Swartz
91 if hasattr(htmlentitydefs
, 'name2codepoint'): # requires Python 2.3
92 return htmlentitydefs
.name2codepoint
[k
]
93 k
= htmlentitydefs
.entitydefs
[k
]
94 if k
.startswith('&#') and k
.endswith(';'):
95 return int(k
[2:-1]) # not in latin-1
98 except KeyError: text
= '&%s;' % ref
99 else: text
= unichr(name2cp(ref
)).encode('utf-8')
100 self
._chars
.append(text
)
102 def _remove_ids_if_duplicates(items
):
106 if i
.guid
is not None and i
.guid
!= "":
107 if ids
.has_key(i
.guid
):
117 def _to_unicode(text
, encoding
):
118 if text
and not isinstance(text
, types
.UnicodeType
):
119 text
= unicode(text
, encoding
)
122 def feedparser_parse(data
):
123 pc
= feedparser
.parse(data
)
125 enc
= pc
.get('encoding', helpers
.get_locale_encoding())
127 enc
= sys
.getdefaultencoding()
def parse_channel_info(feed, parsed_content, encoding):
    """Copy channel-level metadata from ``parsed_content`` onto ``feed``.

    feed           -- object receiving the values as attributes and as
                      entries of its ``pdict`` mapping.
    parsed_content -- parse result whose ``feed`` member supplies the
                      channel fields through ``get()``.
    encoding       -- codec name handed to ``_to_unicode`` for every
                      textual field.
    """
    channel = parsed_content.feed

    # Textual attributes fall back to an empty string when absent.
    for attr in ('title', 'description', 'link', 'copyright'):
        setattr(feed, attr, _to_unicode(channel.get(attr, ''), encoding))

    # The modification date is stored exactly as reported, unconverted.
    feed.last_build_date = channel.get('modified')

    # Optional properties fall back to None.  Note that the "webmaster"
    # entry is filled from the parsed 'publisher' field.
    optional_fields = (
        ("creator", 'creator'),
        ("generator", 'generator'),
        ("category", 'category'),
        ("ttl", 'ttl'),
        ("webmaster", 'publisher'),
    )
    for pdict_key, channel_key in optional_fields:
        value = channel.get(channel_key, None)
        feed.pdict[pdict_key] = _to_unicode(value, encoding)
144 def parse(content
, feed
= None, location
= None):
145 parsed_content
, encoding
= feedparser_parse(content
)
146 #print parsed_content
149 from model
import Feed
151 feed
.location
= location
153 parse_channel_info(feed
, parsed_content
, encoding
)
155 for entry
in parsed_content
.entries
:
156 item
= _parse_entry(entry
, feed
)
159 _remove_ids_if_duplicates(feed
.items
)
163 def sanitize_content(data
, feed
, limit
=60):
167 tp
= TitleImgParser(feed
)
170 #images = [image for image in tp.get_image_urls()]
171 #print tp.get_image_urls()
172 images
= tp
.get_image_urls()
173 title
= tp
.get_text(limit
)
174 except Exception, ex
:
179 return (title
, images
)
181 def _parse_entry(entry
, feed
):
187 title
= ""#_("No title")
189 if entry
.has_key('content'):
190 # it can have multiple content, so we just aggregate them for now.
191 for c
in entry
.content
:
193 if c
.value
not in content
:
194 content
.append(c
.value
)
195 except TypeError, te
:
199 if not len(content
) and entry
.has_key('summary'):
200 content
.append(entry
.get('summary', ''))
202 description
= "<br/>".join(content
)
204 title
= entry
.get('title', '')
206 alttitle
, item
.images
= sanitize_content(description
, feed
)
208 #[ImageCache.cache.add_refer(image, False, item) for image in images]
211 # get the first MAXSPLIT words of the description and make that as our
213 #dwords = string.splitfields(alttitle, maxsplit=6)
214 #title = ' '.join(dwords[:]) + ' ...'
215 title
= title
.replace('\n', '')
218 item
.description
= description
219 item
.guidislink
= entry
.get('guidislink', False)
220 item
.link
= entry
.get('link', None)
221 item
.guid
= entry
.get('guid', None)
222 item
.creator
= entry
.get('author', None)
223 item
.contributors
= entry
.get('contributors', None)
224 item
.license_urls
.append(entry
.get('license', None))
225 item
.fm_license
= entry
.get('fm_license', None)
226 item
.fm_changes
= entry
.get('fm_changes', None)
227 item
.publication_name
= entry
.get('prism_publicationname', None)
228 item
.publication_volume
= entry
.get('prism_volume', None)
229 item
.publication_number
= entry
.get('prism_number', None)
230 item
.publication_section
= entry
.get('prism_section', None)
231 item
.publication_starting_page
= entry
.get('prism_startingpage', None)
232 item
.enclosures
= entry
.get('enclosures', None)
234 date_tuple
= entry
.get('updated_parsed', time
.localtime())
236 if date_tuple
and len(date_tuple
) == 9:
237 item
.pub_date
= time
.strftime("%Y-%m-%d %H:%M:%S", date_tuple
)
239 if entry
.has_key('source'):
240 url
= entry
.source
.get('url', None)
241 text
= entry
.source
.get('value', None)
243 item
.source
= {'url': url
,
246 # There's no point displaying the source if there's no url in the
247 # first place. This is a violation of the RSS 0.92 spec
248 # http://backend.userland.com/rss092.