straw/opml.py

   1 """ OPML.py
   2
   3 """
   4
   5 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
   6 __author__  = "Juri Pakaste <juri@iki.fi>"
   7 __license__ = """
   8 Straw is free software; you can redistribute it and/or modify it under the
   9 terms of the GNU General Public License as published by the Free Software
  10 Foundation; either version 2 of the License, or (at your option) any later
  11 version.
  12
  13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
  15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
  16
  17 You should have received a copy of the GNU General Public License along with
  18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
  19 Place - Suite 330, Boston, MA 02111-1307, USA. """
  20
  21
  22 from straw.JobManager import Job, TaskThread, TaskInfo, ThreadPoolJobHandler
  23 from straw.model import Category, Feed
  24 from threading import Lock
  25 from xml.sax import saxutils, make_parser, SAXParseException
  26 from xml.sax.handler import feature_namespaces, feature_namespace_prefixes
  27 from xml.sax.saxutils import XMLGenerator
  28 from xml.sax.xmlreader import AttributesImpl
  29 import gnomevfs
  30 import straw.JobManager as JobManager
  31 import sys
  32 import time
  33 import xml.sax._exceptions
  34 import xml.sax.handler
  35
# Module-level lock object. Nothing in the visible part of this module
# acquires it -- NOTE(review): confirm whether other modules import it.
lock = Lock()
  37
  38 class OPMLParseJobHandler(ThreadPoolJobHandler):
  39     job_id = "opml-parse"
  40
  41     def __init__(self, job):
  42         ThreadPoolJobHandler.__init__(self, job)
  43
  44         self.pool_size = 1
  45         self.task_class = OPMLParseTaskThread
  46
  47     def _split(self):
  48         ti = TaskInfo(0, { "file_path": self.job.data[0], "category": self.job.data[1] })
  49         self.task_queue.put(ti)
  50
  51     def _prepare_result(self):
  52         task_result = self.result_queue.get()
  53         tree = self._build_tree(task_result.result.roots())
  54         return (tree, task_result.task_info.data["category"])
  55
  56     def _build_tree(self, outlines, parent = None):
  57         save_list = []
  58         i = 0
  59
  60         for outline in outlines:
  61             if not outline.has_key("type"):
  62                 # Some feeds exported from Liferea don't have "type" attribute.
  63                 outline["type"] = "rss"
  64
  65             if outline["type"] == "folder" or len(outline.children) > 0:
  66                 category = Category()
  67                 category.norder = i
  68                 category.name = outline["text"]
  69                 category.parent = parent
  70
  71                 save_list.append(category)
  72
  73                 if not outline.children:
  74                     continue
  75
  76                 save_list.extend(self._build_tree(outline.children, category))
  77             else:
  78                 feed = Feed()
  79                 feed.norder = i
  80
  81                 if outline.has_key("title"):
  82                     feed.title = outline["title"]
  83                 elif outline.has_key("text"):
  84                     feed.title = outline["text"]
  85                 else:
  86                     feed.title = "[unknown title]"
  87
  88                 feed.parent = parent
  89                 feed.location = ""
  90
  91                 if outline.has_key("xmlUrl"):
  92                     feed.location = outline["xmlUrl"]
  93                 elif outline.has_key("url"):
  94                     feed.location = outline["url"]
  95
  96                 if outline.has_key("htmlUrl"):
  97                     feed.link = outline["htmlUrl"]
  98                 elif outline.has_key("url"):
  99                     feed.link = outline["url"]
 100                 else:
 101                     feed.link = ""
 102
 103                 save_list.append(feed)
 104
 105             i += 1
 106
 107         return save_list
 108
 109 class OPMLParseTaskThread(TaskThread):
 110     def __init__(self, handler, task_queue, result_queue):
 111         TaskThread.__init__(self, handler, task_queue, result_queue)
 112
 113     def _process(self, task):
 114         opml = None
 115
 116         try:
 117             fstream = open(task.data["file_path"])
 118             opml = read(fstream)
 119         except Exception, inst:
 120             print inst
 121
 122         return opml
 123
# Register the OPML parse handler with the job manager when this module is
# imported.
JobManager.register_handler(OPMLParseJobHandler)
 125
 126 class OPML(dict):
 127     def __init__(self):
 128         self.outlines = []
 129
 130     def output(self, stream = sys.stdout):
 131         xg = XMLGenerator(stream, encoding='utf-8')
 132         def elemWithContent(name, content):
 133             xg.startElement(name, AttributesImpl({}))
 134             if content is not None:
 135                 xg.characters(content)
 136             xg.endElement(name)
 137             xg.characters("\n")
 138         xg.startElement("opml", AttributesImpl({'version': '1.1'}))
 139         xg.startElement("head", AttributesImpl({}))
 140         for key in ('title', 'dateCreated', 'dateModified', 'ownerName',
 141                     'ownerEmail', 'expansionState', 'vertScrollState',
 142                     'windowTop', 'windowBotton', 'windowRight', 'windowLeft'):
 143             if self.has_key(key) and self[key] != "":
 144                 elemWithContent(key, self[key])
 145         xg.endElement("head")
 146         xg.startElement("body", AttributesImpl({}))
 147         for o in self.outlines:
 148             o.output(xg)
 149         xg.endElement("body")
 150         xg.endElement("opml")
 151
 152 class Outline(dict):
 153     __slots__ = ('_children')
 154
 155     def __init__(self):
 156         self._children = []
 157
 158     def add_child(self, outline):
 159         self._children.append(outline)
 160
 161     def get_children_iter(self):
 162         return self.OIterator(self)
 163
 164     children = property(get_children_iter, None, None, "")
 165
 166     def output(self, xg):
 167         xg.startElement("outline", AttributesImpl(self))
 168         for c in self.children:
 169             c.output(xg)
 170         xg.endElement("outline")
 171         xg.characters("\n")
 172
 173     class OIterator:
 174         def __init__(self, o):
 175             self._o = o
 176             self._index = -1
 177
 178         def __iter__(self):
 179             return self
 180
 181         def __len__(self):
 182             return len(self._o._children)
 183
 184         def next(self):
 185             self._index += 1
 186             if self._index < len(self._o._children):
 187                 return self._o._children[self._index]
 188             else:
 189                 raise StopIteration
 190
 191 class OutlineList(object):
 192     def __init__(self):
 193         self._roots = []
 194         self._stack = []
 195
 196     def add_outline(self, outline):
 197         if len(self._stack):
 198             self._stack[-1].add_child(outline)
 199         else:
 200             self._roots.append(outline)
 201         self._stack.append(outline)
 202
 203     def close_outline(self):
 204         if len(self._stack):
 205             del self._stack[-1]
 206
 207     def roots(self):
 208         return self._roots
 209
 210 class OPMLHandler(xml.sax.handler.ContentHandler):
 211     def __init__(self):
 212         self._outlines = OutlineList()
 213         self._opml = None
 214         self._content = ""
 215
 216     def startElement(self, name, attrs):
 217         if self._opml is None:
 218             if name != 'opml':
 219                 raise ValueError, "This doesn't look like OPML"
 220             self._opml = OPML()
 221         if name == 'outline':
 222             o = Outline()
 223             o.update(attrs)
 224             self._outlines.add_outline(o)
 225         self._content = ""
 226
 227     def endElement(self, name):
 228         if name == 'outline':
 229             self._outlines.close_outline()
 230             return
 231         if name == 'opml':
 232             self._opml.outlines = self._outlines.roots()
 233             return
 234         for key in ('title', 'dateCreated', 'dateModified', 'ownerName',
 235                     'ownerEmail', 'expansionState', 'vertScrollState',
 236                     'windowTop', 'windowBotton', 'windowRight', 'windowLeft'):
 237             if name == key:
 238                 self._opml[key] = self._content
 239                 return
 240
 241     def characters(self, ch):
 242         self._content += ch
 243
 244     def get_opml(self):
 245         return self._opml
 246
 247     def get_outlines(self):
 248         return self._outlines
 249
 250 def parse(stream):
 251     parser = make_parser()
 252     parser.setFeature(feature_namespaces, 0)
 253     handler = OPMLHandler()
 254     parser.setContentHandler(handler)
 255
 256     parser.parse(stream)
 257     return handler.get_outlines()
 258
 259 def export(title, list, fname):
 260     opml = OPML()
 261     opml['title'] = title
 262     for feed in list:
 263         o = Outline()
 264         o['text'] = feed.title.encode('utf-8')
 265         o['description'] = feed.channel_description.encode('utf-8')
 266         o['htmlUrl'] = feed.channel_link
 267         o['language'] = 'unknown'
 268         o['title'] = feed.channel_title.encode('utf-8')
 269         o['type'] = 'rss'
 270         o['version'] = 'RSS'
 271         o['xmlUrl'] = feed.access_info[0]
 272         opml.outlines.append(o)
 273     f = gnomevfs.create(fname, gnomevfs.OPEN_WRITE, 0)
 274     f.write('<?xml version="1.0"?>\n')
 275     opml.output(f)
 276     f.close()
 277
class BlogListEntry(object):
    """Plain record for one entry found in an outline tree.

    Instances carry only ``text`` and ``url``; both are assigned by
    ``_find_entries``.
    """
    __slots__ = ('text', 'url')
 280
 281 def _find_entries(outline):
 282     entries = []
 283     for c in outline.children:
 284         entries += _find_entries(c)
 285     type = outline.get('type', '')
 286     text = outline.get('text', '')
 287     e = None
 288     if type == 'link':
 289         url = outline.get('url', '')
 290         if url != '':
 291             e = BlogListEntry()
 292             e.text = text
 293             e.url = url
 294     else:
 295         xmlurl = outline.get('xmlUrl', '')
 296         e = BlogListEntry()
 297         e.text = text
 298         if text == '':
 299             title = outline.get('title', '')
 300             if title == '':
 301                 e = None
 302             e.text = title
 303         if e != None:
 304             if xmlurl != '':
 305                 # there's something in xmlurl. There's a good chance that's
 306                 # our feed's URL
 307                 e.url = xmlurl
 308             else:
 309                 htmlurl = outline.get('htmlUrl', '')
 310                 if htmlurl != '':
 311                     # there's something in htmlurl, and xmlurl is empty. This
 312                     # might be our feed's URL.
 313                     e.url = htmlurl
 314                 else:
 315                     # nothing else to try.
 316                     e = None
 317     if e is not None:
 318         entries[0:0] = [e]
 319     return entries
 320
 321 def find_entries(outlines):
 322     entries = []
 323     for o in outlines:
 324         entries += _find_entries(o)
 325     return entries
 326
 327 def read(stream):
 328     try:
 329         o = parse(stream)
 330         return o
 331     except ValueError:
 332         return None
 333     entries = find_entries(o.outlines)
 334     ret = list()
 335     edict = dict()
 336     # avoid duplicates.
 337     for e in entries:
 338         ek = (e.text, e.url)
 339         edict[ek] = edict.get(ek, 0) + 1
 340         if edict[ek] < 2:
 341             ret.append(e)
 342     return ret