straw/opml.py

   1 """ OPML.py
   2
   3 """
   4
   5 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
   6 __author__  = "Juri Pakaste <juri@iki.fi>"
   7 __license__ = """
   8 Straw is free software; you can redistribute it and/or modify it under the
   9 terms of the GNU General Public License as published by the Free Software
  10 Foundation; either version 2 of the License, or (at your option) any later
  11 version.
  12
  13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License along with
  18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  19 Place - Suite 330, Boston, MA 02111-1307, USA. """
  20
  21 from Fetcher import FetchTask
  22 from straw.JobManager import Job, TaskThread, ThreadPoolJobHandler
  23 from straw.model import Category, Feed
  24 from threading import Lock
  25 from xml.sax import saxutils, make_parser, SAXParseException
  26 from xml.sax.handler import feature_namespaces, feature_namespace_prefixes
  27 from xml.sax.saxutils import XMLGenerator
  28 from xml.sax.xmlreader import AttributesImpl
  29 import Fetcher
  30 import gnomevfs
  31 import straw.JobManager as JobManager
  32 import sys
  33 import time
  34 import xml.sax._exceptions
  35 import xml.sax.handler
  36
# Module-level lock object. It is not acquired anywhere in the code visible
# in this module. NOTE(review): confirm whether other modules use it before
# considering its removal.
lock = Lock()
  38
  39 class OPMLParseJobHandler(ThreadPoolJobHandler):
  40     job_id = "opml-parse"
  41
  42     def __init__(self, job):
  43         ThreadPoolJobHandler.__init__(self, job)
  44
  45         self.pool_size = 1
  46         self.task_class = OPMLParseTaskThread
  47
  48     def _on_url_fetched(self, handler, task_result):
  49         self.task_queue.put((task_result.task.user_data, task_result.result))
  50
  51     def _prepare(self):
  52         category = self.job.data[1]
  53         url = self.job.data[0]
  54         self.job_size = 1
  55         fetch_tasks = [FetchTask(url = url, user_data = category)]
  56         observers = [{ "task-done": [ self._on_url_fetched ]}]
  57         Fetcher.fetch(fetch_tasks, observers = observers)
  58
  59 class OPMLParseTaskThread(TaskThread):
  60     def __init__(self, handler):
  61         TaskThread.__init__(self, handler)
  62
  63     def _process(self, task):
  64         opml = read(task[1])
  65         tree = self._build_tree(opml.roots(), parent = task[0])
  66         return tree
  67
  68     def _build_tree(self, outlines, parent = None):
  69         save_list = []
  70         i = 0
  71
  72         for outline in outlines:
  73             if not outline.has_key("type"):
  74                 # Some feeds exported from Liferea don't have "type" attribute.
  75                 outline["type"] = "rss"
  76
  77             if outline["type"] == "folder" or len(outline.children) > 0:
  78                 category = Category()
  79                 category.norder = i
  80                 category.name = outline["text"]
  81                 category.parent = parent
  82
  83                 save_list.append(category)
  84
  85                 if not outline.children:
  86                     continue
  87
  88                 save_list.extend(self._build_tree(outline.children, category))
  89             else:
  90                 feed = Feed()
  91                 feed.norder = i
  92
  93                 if outline.has_key("title"):
  94                     feed.title = outline["title"]
  95                 elif outline.has_key("text"):
  96                     feed.title = outline["text"]
  97                 else:
  98                     feed.title = "[unknown title]"
  99
 100                 feed.parent = parent
 101                 feed.location = ""
 102
 103                 if outline.has_key("xmlUrl"):
 104                     feed.location = outline["xmlUrl"]
 105                 elif outline.has_key("url"):
 106                     feed.location = outline["url"]
 107
 108                 if outline.has_key("htmlUrl"):
 109                     feed.link = outline["htmlUrl"]
 110                 elif outline.has_key("url"):
 111                     feed.link = outline["url"]
 112                 else:
 113                     feed.link = ""
 114
 115                 save_list.append(feed)
 116
 117             i += 1
 118
 119         return save_list
 120
# Register the handler class (job_id "opml-parse") with the JobManager at
# import time.
JobManager.register_handler(OPMLParseJobHandler)
 122
 123 class OPML(dict):
 124     def __init__(self):
 125         self.outlines = []
 126
 127     def output(self, stream = sys.stdout):
 128         xg = XMLGenerator(stream, encoding='utf-8')
 129         def elemWithContent(name, content):
 130             xg.startElement(name, AttributesImpl({}))
 131             if content is not None:
 132                 xg.characters(content)
 133             xg.endElement(name)
 134             xg.characters("\n")
 135         xg.startElement("opml", AttributesImpl({'version': '1.1'}))
 136         xg.startElement("head", AttributesImpl({}))
 137         for key in ('title', 'dateCreated', 'dateModified', 'ownerName',
 138                     'ownerEmail', 'expansionState', 'vertScrollState',
 139                     'windowTop', 'windowBotton', 'windowRight', 'windowLeft'):
 140             if self.has_key(key) and self[key] != "":
 141                 elemWithContent(key, self[key])
 142         xg.endElement("head")
 143         xg.startElement("body", AttributesImpl({}))
 144         for o in self.outlines:
 145             o.output(xg)
 146         xg.endElement("body")
 147         xg.endElement("opml")
 148
 149 class Outline(dict):
 150     __slots__ = ('_children')
 151
 152     def __init__(self):
 153         self._children = []
 154
 155     def add_child(self, outline):
 156         self._children.append(outline)
 157
 158     def get_children_iter(self):
 159         return self.OIterator(self)
 160
 161     children = property(get_children_iter, None, None, "")
 162
 163     def output(self, xg):
 164         xg.startElement("outline", AttributesImpl(self))
 165         for c in self.children:
 166             c.output(xg)
 167         xg.endElement("outline")
 168         xg.characters("\n")
 169
 170     class OIterator:
 171         def __init__(self, o):
 172             self._o = o
 173             self._index = -1
 174
 175         def __iter__(self):
 176             return self
 177
 178         def __len__(self):
 179             return len(self._o._children)
 180
 181         def next(self):
 182             self._index += 1
 183             if self._index < len(self._o._children):
 184                 return self._o._children[self._index]
 185             else:
 186                 raise StopIteration
 187
 188 class OutlineList(object):
 189     def __init__(self):
 190         self._roots = []
 191         self._stack = []
 192
 193     def add_outline(self, outline):
 194         if len(self._stack):
 195             self._stack[-1].add_child(outline)
 196         else:
 197             self._roots.append(outline)
 198         self._stack.append(outline)
 199
 200     def close_outline(self):
 201         if len(self._stack):
 202             del self._stack[-1]
 203
 204     def roots(self):
 205         return self._roots
 206
 207 class OPMLHandler(xml.sax.handler.ContentHandler):
 208     def __init__(self):
 209         self._outlines = OutlineList()
 210         self._opml = None
 211         self._content = ""
 212
 213     def startElement(self, name, attrs):
 214         if self._opml is None:
 215             if name != 'opml':
 216                 raise ValueError, "This doesn't look like OPML"
 217             self._opml = OPML()
 218         if name == 'outline':
 219             o = Outline()
 220             o.update(attrs)
 221             self._outlines.add_outline(o)
 222         self._content = ""
 223
 224     def endElement(self, name):
 225         if name == 'outline':
 226             self._outlines.close_outline()
 227             return
 228         if name == 'opml':
 229             self._opml.outlines = self._outlines.roots()
 230             return
 231         for key in ('title', 'dateCreated', 'dateModified', 'ownerName',
 232                     'ownerEmail', 'expansionState', 'vertScrollState',
 233                     'windowTop', 'windowBotton', 'windowRight', 'windowLeft'):
 234             if name == key:
 235                 self._opml[key] = self._content
 236                 return
 237
 238     def characters(self, ch):
 239         self._content += ch
 240
 241     def get_opml(self):
 242         return self._opml
 243
 244     def get_outlines(self):
 245         return self._outlines
 246
 247 def parse(stream):
 248     """parser = make_parser()
 249     parser.setFeature(feature_namespaces, 0)
 250     handler = OPMLHandler()
 251     parser.setContentHandler(handler)"""
 252     handler = OPMLHandler()
 253     xml.sax.parseString(stream, handler)
 254     print handler.get_outlines()
 255     return handler.get_outlines()
 256
 257 def export(root, filename):
 258     opml = OPML()
 259     opml['title'] = "Exported from Straw"
 260
 261     def _export(node, opml):
 262         o = Outline()
 263
 264         if node.type == "F":
 265             o['text'] = node.title.encode('utf-8')
 266             o['description'] = node.title.encode('utf-8')
 267             o['htmlUrl'] = node.link
 268             o['language'] = 'unknown'
 269             o['title'] = node.title.encode('utf-8')
 270             o['type'] = 'rss'
 271             o['version'] = 'RSS'
 272             o['xmlUrl'] = node.location
 273         elif node.type == "C":
 274             o['text'] = node.name.encode('utf-8')
 275             o['description'] = node.name.encode('utf-8')
 276             o['type'] = 'folder'
 277
 278         for child_node in node.children:
 279             o.add_child(_export(child_node, opml))
 280
 281         return o
 282
 283     opml.outlines.append(_export(root, opml))
 284
 285     f = gnomevfs.create(filename, gnomevfs.OPEN_WRITE, 0)
 286     f.write('<!DOCTYPE opml PUBLIC "-//Userland//DTD OPML XML V1.0//EN" ' + \
 287         '"http://static.userland.com/gems/radiodiscuss/opmlDtd.txt">')
 288     f.write('<?xml version="1.0"?>\n')
 289     opml.output(f)
 290     f.close()
 291
class BlogListEntry(object):
    """Plain record with two attributes, text and url.

    _find_entries fills text from an outline's "text" or "title" attribute
    and url from its "url", "xmlUrl" or "htmlUrl" attribute.
    """
    __slots__ = ('text', 'url')
 294
 295 def _find_entries(outline):
 296     entries = []
 297     for c in outline.children:
 298         entries += _find_entries(c)
 299     type = outline.get('type', '')
 300     text = outline.get('text', '')
 301     e = None
 302     if type == 'link':
 303         url = outline.get('url', '')
 304         if url != '':
 305             e = BlogListEntry()
 306             e.text = text
 307             e.url = url
 308     else:
 309         xmlurl = outline.get('xmlUrl', '')
 310         e = BlogListEntry()
 311         e.text = text
 312         if text == '':
 313             title = outline.get('title', '')
 314             if title == '':
 315                 e = None
 316             e.text = title
 317         if e != None:
 318             if xmlurl != '':
 319                 # there's something in xmlurl. There's a good chance that's
 320                 # our feed's URL
 321                 e.url = xmlurl
 322             else:
 323                 htmlurl = outline.get('htmlUrl', '')
 324                 if htmlurl != '':
 325                     # there's something in htmlurl, and xmlurl is empty. This
 326                     # might be our feed's URL.
 327                     e.url = htmlurl
 328                 else:
 329                     # nothing else to try.
 330                     e = None
 331     if e is not None:
 332         entries[0:0] = [e]
 333     return entries
 334
 335 def find_entries(outlines):
 336     entries = []
 337     for o in outlines:
 338         entries += _find_entries(o)
 339     return entries
 340
 341 def read(stream):
 342     o = parse(stream)
 343     return o
 344     entries = find_entries(o.outlines)
 345     ret = list()
 346     edict = dict()
 347     # avoid duplicates.
 348     for e in entries:
 349         ek = (e.text, e.url)
 350         edict[ek] = edict.get(ek, 0) + 1
 351         if edict[ek] < 2:
 352             ret.append(e)
 353     return ret