straw/opml.py

   1 """ OPML.py
   2
   3 """
   4
   5 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
   6 __author__  = "Juri Pakaste <juri@iki.fi>"
   7 __license__ = """
   8 Straw is free software; you can redistribute it and/or modify it under the
   9 terms of the GNU General Public License as published by the Free Software
  10 Foundation; either version 2 of the License, or (at your option) any later
  11 version.
  12
  13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License along with
  18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  19 Place - Suite 330, Boston, MA 02111-1307, USA. """
  20
  21
  22 from straw.JobManager import Job, TaskThread, TaskInfo, ThreadPoolJobHandler
  23 from straw.model import Category, Feed
  24 from threading import Lock
  25 from xml.sax import saxutils, make_parser, SAXParseException
  26 from xml.sax.handler import feature_namespaces, feature_namespace_prefixes
  27 from xml.sax.saxutils import XMLGenerator
  28 from xml.sax.xmlreader import AttributesImpl
  29 import gnomevfs
  30 import straw.JobManager as JobManager
  31 import sys
  32 import time
  33 import xml.sax._exceptions
  34 import xml.sax.handler
  35
# Module-level lock. NOTE(review): nothing in the visible part of this module
# acquires it -- confirm whether other modules import it before removing.
lock = Lock()
  37
  38 class OPMLParseJobHandler(ThreadPoolJobHandler):
  39     job_id = "opml-parse"
  40
  41     def __init__(self, job):
  42         ThreadPoolJobHandler.__init__(self, job)
  43
  44         self.pool_size = 1
  45         self.task_class = OPMLParseTaskThread
  46
  47     def _split(self):
  48         ti = TaskInfo(0, { "file_path": self.job.data[0], "category": self.job.data[1] })
  49         self.task_queue.put(ti)
  50
  51     def _prepare_result(self):
  52         task_result = self.result_queue.get()
  53         category = task_result.task_info.data["category"]
  54         tree = self._build_tree(task_result.result.roots(), parent = category)
  55         return (tree, task_result.task_info.data["category"])
  56
  57     def _build_tree(self, outlines, parent = None):
  58         save_list = []
  59         i = 0
  60
  61         for outline in outlines:
  62             if not outline.has_key("type"):
  63                 # Some feeds exported from Liferea don't have "type" attribute.
  64                 outline["type"] = "rss"
  65
  66             if outline["type"] == "folder" or len(outline.children) > 0:
  67                 category = Category()
  68                 category.norder = i
  69                 category.name = outline["text"]
  70                 category.parent = parent
  71
  72                 save_list.append(category)
  73
  74                 if not outline.children:
  75                     continue
  76
  77                 save_list.extend(self._build_tree(outline.children, category))
  78             else:
  79                 feed = Feed()
  80                 feed.norder = i
  81
  82                 if outline.has_key("title"):
  83                     feed.title = outline["title"]
  84                 elif outline.has_key("text"):
  85                     feed.title = outline["text"]
  86                 else:
  87                     feed.title = "[unknown title]"
  88
  89                 feed.parent = parent
  90                 feed.location = ""
  91
  92                 if outline.has_key("xmlUrl"):
  93                     feed.location = outline["xmlUrl"]
  94                 elif outline.has_key("url"):
  95                     feed.location = outline["url"]
  96
  97                 if outline.has_key("htmlUrl"):
  98                     feed.link = outline["htmlUrl"]
  99                 elif outline.has_key("url"):
 100                     feed.link = outline["url"]
 101                 else:
 102                     feed.link = ""
 103
 104                 save_list.append(feed)
 105
 106             i += 1
 107
 108         return save_list
 109
 110 class OPMLParseTaskThread(TaskThread):
 111     def __init__(self, handler, task_queue, result_queue):
 112         TaskThread.__init__(self, handler, task_queue, result_queue)
 113
 114     def _process(self, task):
 115         opml = None
 116
 117         try:
 118             fstream = open(task.data["file_path"])
 119             opml = read(fstream)
 120         except Exception, inst:
 121             print inst
 122
 123         return opml
 124
# Import-time side effect: hand the OPML parse handler class to the job manager.
JobManager.register_handler(OPMLParseJobHandler)
 126
 127 class OPML(dict):
 128     def __init__(self):
 129         self.outlines = []
 130
 131     def output(self, stream = sys.stdout):
 132         xg = XMLGenerator(stream, encoding='utf-8')
 133         def elemWithContent(name, content):
 134             xg.startElement(name, AttributesImpl({}))
 135             if content is not None:
 136                 xg.characters(content)
 137             xg.endElement(name)
 138             xg.characters("\n")
 139         xg.startElement("opml", AttributesImpl({'version': '1.1'}))
 140         xg.startElement("head", AttributesImpl({}))
 141         for key in ('title', 'dateCreated', 'dateModified', 'ownerName',
 142                     'ownerEmail', 'expansionState', 'vertScrollState',
 143                     'windowTop', 'windowBotton', 'windowRight', 'windowLeft'):
 144             if self.has_key(key) and self[key] != "":
 145                 elemWithContent(key, self[key])
 146         xg.endElement("head")
 147         xg.startElement("body", AttributesImpl({}))
 148         for o in self.outlines:
 149             o.output(xg)
 150         xg.endElement("body")
 151         xg.endElement("opml")
 152
 153 class Outline(dict):
 154     __slots__ = ('_children')
 155
 156     def __init__(self):
 157         self._children = []
 158
 159     def add_child(self, outline):
 160         self._children.append(outline)
 161
 162     def get_children_iter(self):
 163         return self.OIterator(self)
 164
 165     children = property(get_children_iter, None, None, "")
 166
 167     def output(self, xg):
 168         xg.startElement("outline", AttributesImpl(self))
 169         for c in self.children:
 170             c.output(xg)
 171         xg.endElement("outline")
 172         xg.characters("\n")
 173
 174     class OIterator:
 175         def __init__(self, o):
 176             self._o = o
 177             self._index = -1
 178
 179         def __iter__(self):
 180             return self
 181
 182         def __len__(self):
 183             return len(self._o._children)
 184
 185         def next(self):
 186             self._index += 1
 187             if self._index < len(self._o._children):
 188                 return self._o._children[self._index]
 189             else:
 190                 raise StopIteration
 191
 192 class OutlineList(object):
 193     def __init__(self):
 194         self._roots = []
 195         self._stack = []
 196
 197     def add_outline(self, outline):
 198         if len(self._stack):
 199             self._stack[-1].add_child(outline)
 200         else:
 201             self._roots.append(outline)
 202         self._stack.append(outline)
 203
 204     def close_outline(self):
 205         if len(self._stack):
 206             del self._stack[-1]
 207
 208     def roots(self):
 209         return self._roots
 210
 211 class OPMLHandler(xml.sax.handler.ContentHandler):
 212     def __init__(self):
 213         self._outlines = OutlineList()
 214         self._opml = None
 215         self._content = ""
 216
 217     def startElement(self, name, attrs):
 218         if self._opml is None:
 219             if name != 'opml':
 220                 raise ValueError, "This doesn't look like OPML"
 221             self._opml = OPML()
 222         if name == 'outline':
 223             o = Outline()
 224             o.update(attrs)
 225             self._outlines.add_outline(o)
 226         self._content = ""
 227
 228     def endElement(self, name):
 229         if name == 'outline':
 230             self._outlines.close_outline()
 231             return
 232         if name == 'opml':
 233             self._opml.outlines = self._outlines.roots()
 234             return
 235         for key in ('title', 'dateCreated', 'dateModified', 'ownerName',
 236                     'ownerEmail', 'expansionState', 'vertScrollState',
 237                     'windowTop', 'windowBotton', 'windowRight', 'windowLeft'):
 238             if name == key:
 239                 self._opml[key] = self._content
 240                 return
 241
 242     def characters(self, ch):
 243         self._content += ch
 244
 245     def get_opml(self):
 246         return self._opml
 247
 248     def get_outlines(self):
 249         return self._outlines
 250
 251 def parse(stream):
 252     parser = make_parser()
 253     parser.setFeature(feature_namespaces, 0)
 254     handler = OPMLHandler()
 255     parser.setContentHandler(handler)
 256     parser.parse(stream)
 257
 258     return handler.get_outlines()
 259
 260 def export(root, filename):
 261     opml = OPML()
 262     opml['title'] = "Exported from Straw"
 263
 264     def _export(node, opml):
 265         o = Outline()
 266
 267         if node.type == "F":
 268             o['text'] = node.title.encode('utf-8')
 269             o['description'] = node.title.encode('utf-8')
 270             o['htmlUrl'] = node.link
 271             o['language'] = 'unknown'
 272             o['title'] = node.title.encode('utf-8')
 273             o['type'] = 'rss'
 274             o['version'] = 'RSS'
 275             o['xmlUrl'] = node.location
 276         elif node.type == "C":
 277             o['text'] = node.name.encode('utf-8')
 278             o['description'] = node.name.encode('utf-8')
 279             o['type'] = 'folder'
 280
 281         for child_node in node.children:
 282             o.add_child(_export(child_node, opml))
 283
 284         return o
 285
 286     opml.outlines.append(_export(root, opml))
 287
 288     f = gnomevfs.create(filename, gnomevfs.OPEN_WRITE, 0)
 289     f.write('<?xml version="1.0"?>\n')
 290     opml.output(f)
 291     f.close()
 292
 293 class BlogListEntry(object):
 294     __slots__ = ('text', 'url')
 295
 296 def _find_entries(outline):
 297     entries = []
 298     for c in outline.children:
 299         entries += _find_entries(c)
 300     type = outline.get('type', '')
 301     text = outline.get('text', '')
 302     e = None
 303     if type == 'link':
 304         url = outline.get('url', '')
 305         if url != '':
 306             e = BlogListEntry()
 307             e.text = text
 308             e.url = url
 309     else:
 310         xmlurl = outline.get('xmlUrl', '')
 311         e = BlogListEntry()
 312         e.text = text
 313         if text == '':
 314             title = outline.get('title', '')
 315             if title == '':
 316                 e = None
 317             e.text = title
 318         if e != None:
 319             if xmlurl != '':
 320                 # there's something in xmlurl. There's a good chance that's
 321                 # our feed's URL
 322                 e.url = xmlurl
 323             else:
 324                 htmlurl = outline.get('htmlUrl', '')
 325                 if htmlurl != '':
 326                     # there's something in htmlurl, and xmlurl is empty. This
 327                     # might be our feed's URL.
 328                     e.url = htmlurl
 329                 else:
 330                     # nothing else to try.
 331                     e = None
 332     if e is not None:
 333         entries[0:0] = [e]
 334     return entries
 335
 336 def find_entries(outlines):
 337     entries = []
 338     for o in outlines:
 339         entries += _find_entries(o)
 340     return entries
 341
 342 def read(stream):
 343     try:
 344         o = parse(stream)
 345         return o
 346     except ValueError:
 347         return None
 348     entries = find_entries(o.outlines)
 349     ret = list()
 350     edict = dict()
 351     # avoid duplicates.
 352     for e in entries:
 353         ek = (e.text, e.url)
 354         edict[ek] = edict.get(ek, 0) + 1
 355         if edict[ek] < 2:
 356             ret.append(e)
 357     return ret