3 Wrapper module to feedparser and responsible for assigning data to Feed and
6 __copyright__
= "Copyright (c) 2002-2005 Free Software Foundation, Inc."
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
22 from straw
import helpers
34 class TitleImgParser(HTMLParser
.HTMLParser
):
35 def __init__(self
, feed
=None):
36 HTMLParser
.HTMLParser
.__init
__(self
)
41 def set_feed(self
, feed
):
def get_image_urls(self):
    """Return the image URLs collected by this parser.

    Method of ``TitleImgParser``.  The list handed back is the parser's own
    ``_image_urls`` list (no copy is made), i.e. the same list that
    ``handle_starttag`` appends completed URLs to.
    """
    return self._image_urls
47 def get_text(self
, nchars
=None):
48 text
= ''.join(self
._chars
).strip()
55 HTMLParser
.HTMLParser
.close(self
)
59 #del self._image_urls[:]
61 def handle_starttag(self
, tag
, attrs
):
63 for name
, value
in attrs
:
65 url
= helpers
.complete_url(value
, self
._feed
.link
)
66 self
._image
_urls
.append(url
)
def handle_data(self, data):
    """Collect a run of character data seen by the HTML parser.

    Method of ``TitleImgParser`` (an ``HTMLParser.HTMLParser`` subclass).
    Each chunk is appended, unmodified, to ``self._chars``; ``get_text``
    later joins those chunks into the plain-text result.
    """
    self._chars.append(data)
72 def handle_charref(self
, ref
):
# called for each character reference, e.g. for '&#160;', ref will be '160'
74 if not self
._chars
: return
76 if ref
in ('34', '38', '39', '60', '62', 'x22', 'x26', 'x27', 'x3c', 'x3e'):
83 text
= unichr(c
).encode('utf-8')
84 self
._chars
.append(text
)
86 def handle_entityref(self
, ref
):
# called for each entity reference, e.g. for '&copy;', ref will be 'copy'
88 if not self
._chars
: return
89 if ref
in ('lt', 'gt', 'quot', 'amp', 'apos'):
92 # entity resolution graciously donated by Aaron Swartz
95 if hasattr(htmlentitydefs
, 'name2codepoint'): # requires Python 2.3
96 return htmlentitydefs
.name2codepoint
[k
]
97 k
= htmlentitydefs
.entitydefs
[k
]
98 if k
.startswith('&#') and k
.endswith(';'):
99 return int(k
[2:-1]) # not in latin-1
102 except KeyError: text
= '&%s;' % ref
103 else: text
= unichr(name2cp(ref
)).encode('utf-8')
104 self
._chars
.append(text
)
108 def _remove_ids_if_duplicates(items
):
112 if i
.guid
is not None and i
.guid
!= "":
113 if ids
.has_key(i
.guid
):
123 def _to_unicode(text
, encoding
):
124 if text
and not isinstance(text
, types
.UnicodeType
):
125 text
= unicode(text
, encoding
)
128 def feedparser_parse(data
):
129 pc
= feedparser
.parse(data
)
130 enc
= pc
.get('encoding', helpers
.get_locale_encoding())
132 enc
= sys
.getdefaultencoding()
def parse_channel_info(parsed, parsed_content, encoding):
    """Copy channel-level fields from a feedparser result onto ``parsed``.

    ``parsed``         -- object receiving the attributes (``parse()`` passes
                          the feed object here).
    ``parsed_content`` -- feedparser result; only its ``feed`` mapping is read.
    ``encoding``       -- codec name used to turn byte strings into unicode.

    ``title``, ``description``, ``link``, ``copyright`` and ``creator``
    default to ``''`` when absent and go through ``_to_unicode``.
    ``last_build_date`` is taken from the ``'modified'`` key as-is, with no
    default supplied and no unicode conversion.

    Returns ``parsed`` after it has been updated in place.
    """
    channel = parsed_content.feed
    parsed.title = _to_unicode(channel.get('title', ''), encoding)
    parsed.description = _to_unicode(channel.get('description', ''), encoding)
    parsed.link = _to_unicode(channel.get('link', ''), encoding)
    parsed.copyright = _to_unicode(channel.get('copyright', ''), encoding)
    parsed.last_build_date = channel.get('modified')
    parsed.creator = _to_unicode(channel.get('creator', ''), encoding)
    # NOTE(review): the return statement was absent from the extracted text;
    # restored because parse() rebinds ``feed`` to this function's result and
    # then goes on to use ``feed.items``.
    return parsed
144 def parse(content
, feed
= None, location
= None):
145 parsed_content
, encoding
= feedparser_parse(content
)
148 from model
import Feed
150 feed
.location
= location
151 feed
= parse_channel_info(feed
, parsed_content
, encoding
)
153 for entry
in parsed_content
.entries
:
154 item
= _parse_entry(entry
, feed
)
157 _remove_ids_if_duplicates(feed
.items
)
161 def sanitize_content(data
, feed
, limit
=60):
165 tp
= TitleImgParser(feed
)
168 #images = [image for image in tp.get_image_urls()]
169 #print tp.get_image_urls()
170 images
= tp
.get_image_urls()
171 title
= tp
.get_text(limit
)
172 except Exception, ex
:
177 return (title
, images
)
179 def _parse_entry(entry
, feed
):
180 from model
import Item
181 item
= Item()#SummaryItem.SummaryItem()
186 title
= ""#_("No title")
188 if entry
.has_key('content'):
189 # it can have multiple content, so we just aggregate them for now.
190 for c
in entry
.content
:
192 if c
.value
not in content
:
193 content
.append(c
.value
)
194 except TypeError, te
:
198 if not len(content
) and entry
.has_key('summary'):
199 content
.append(entry
.get('summary', ''))
201 description
= "<br/>".join(content
)
203 title
= entry
.get('title', '')
205 alttitle
, item
.images
= sanitize_content(description
, feed
)
207 #[ImageCache.cache.add_refer(image, False, item) for image in images]
210 # get the first MAXSPLIT words of the description and make that as our
212 #dwords = string.splitfields(alttitle, maxsplit=6)
213 #title = ' '.join(dwords[:]) + ' ...'
214 title
= title
.replace('\n', '')
217 item
.description
= description
218 item
.guidislink
= entry
.get('guidislink', False)
219 item
.link
= entry
.get('link', None)
220 item
.guid
= entry
.get('guid', None)
221 item
.creator
= entry
.get('author', None)
222 item
.contributors
= entry
.get('contributors', None)
223 item
.pub_date
= entry
.get('modified_parsed', time
.strftime("%Y-%m-%d", time
.localtime()))#time.localtime())
224 item
.license_urls
.append(entry
.get('license', None))
225 item
.fm_license
= entry
.get('fm_license', None)
226 item
.fm_changes
= entry
.get('fm_changes', None)
227 item
.publication_name
= entry
.get('prism_publicationname', None)
228 item
.publication_volume
= entry
.get('prism_volume', None)
229 item
.publication_number
= entry
.get('prism_number', None)
230 item
.publication_section
= entry
.get('prism_section', None)
231 item
.publication_starting_page
= entry
.get('prism_startingpage', None)
232 item
.enclosures
= entry
.get('enclosures', None)
235 #print time.strftime("%Y-%m-%d",time.localtime())
236 item
.pub_date
= time
.strftime("%Y-%m-%d %H:%M", time
.localtime())
238 if entry
.has_key('source'):
239 url
= entry
.source
.get('url', None)
240 text
= entry
.source
.get('value', None)
242 item
.source
= {'url': url
,
245 # There's no point displaying the source if there's no url in the
246 # first place. This is a violation of the RSS 0.92 spec
247 # http://backend.userland.com/rss092.