planet.py

   1 import shelve
   2
   3 from feedcache.cache import Cache
   4
   5 from django.template.defaultfilters import truncatewords_html
   6 from django.utils.encoding import smart_unicode
   7
   8 def TryEncoding(content):
   9     for body_charset in 'UTF-8', 'US-ASCII', 'ISO-8859-1', :
  10         try:
  11             return content.encode(body_charset)
  12         except UnicodeError:
  13             pass
  14         except Exception, e:
  15             if not type(content) is str:
  16                 pass
  17
  18
  19 def GetContent(feed):
  20     if hasattr(feed, "content"):
  21         return feed.content[0]["value"]
  22     else:
  23         return feed.summary
  24
  25
  26 class Parser:
  27     """
  28     "http://planet.gentoo.org/atom.xml"
  29     "http://overlays.gentoo.org/rss20.xml"
  30     "http://www.gentoo.org/rdf/en/gentoo-news.rdf"
  31     >>> f = Parser()
  32     >>> f.GetTitle()
  33     u'Planet Gentoo'
  34     >>> f.GetLink()
  35     u'http://planet.gentoo.org/'
  36     >>> for e in f: print e["title"], e["content"]
  37     """
  38
  39     def __init__(self, url, summary=False):
  40         storage = shelve.open("/home/timemachine/.feedcache")
  41         try :
  42             fc = Cache(storage)
  43             self.feed = fc.fetch(url)
  44             self.iterator = 0
  45             self.summary = summary
  46         finally:
  47             storage.close()
  48     def GetTitle(self):
  49         return self.feed.feed.title
  50
  51     def GetLink(self):
  52         return self.feed.feed.link
  53
  54     def next(self):
  55         if self.iterator >= len(self.feed.entries): raise StopIteration
  56
  57         entry = self.feed.entries[self.iterator]
  58
  59         title = TryEncoding(entry["title"])
  60         content = TryEncoding(GetContent(entry))
  61         link = entry.link
  62
  63         if self.summary:
  64             content = TryEncoding(truncatewords_html(content, 30))
  65             content = "".join((content , "..."))
  66
  67         entry = dict((("title", title), ("content", content), ("link", link)))
  68
  69         self.iterator += 1
  70         return entry
  71
  72     def __iter__(self):
  73         return self
  74
  75 def _test():
  76     """
  77     import doctest, ebuilds.ebuildfind.planet as planet
  78     return doctest.testmod(planet)
  79
  80     "http://news.google.fr/news?pz=1&ned=us&hl=en&q=gentoo+AND+(linux+OR+OS+OR+Operating+System+OR+GNU)&output=rss"
  81     "http://www.gentoo.org/rdf/en/gentoo-news.rdf"
  82     "http://planet.gentoo.org/atom.xml"
  83     "http://overlays.gentoo.org/rss20.xml"
  84     """
  85
  86     f = Parser("http://planet.gentoo.org/atom.xml", True)
  87
  88     print f.GetTitle()
  89     print f.GetLink()
  90
  91     for e in f:
  92         print e["title"]
  93         print e["content"]
  94         print e["link"]
  95
  96 if __name__ == "__main__":
  97     _test()