Implemented Fetcher job, refactored JobManager, code cleanup.
[straw.git] / straw / opml.py
blobea303ac64c3ea120e59a5a0678292a702f78e91d
1 """ OPML.py
3 """
5 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
6 __author__ = "Juri Pakaste <juri@iki.fi>"
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
21 from Fetcher import FetchTask
22 from straw.JobManager import Job, TaskThread, ThreadPoolJobHandler
23 from straw.model import Category, Feed
24 from threading import Lock
25 from xml.sax import saxutils, make_parser, SAXParseException
26 from xml.sax.handler import feature_namespaces, feature_namespace_prefixes
27 from xml.sax.saxutils import XMLGenerator
28 from xml.sax.xmlreader import AttributesImpl
29 import Fetcher
30 import gnomevfs
31 import straw.JobManager as JobManager
32 import sys
33 import time
34 import xml.sax._exceptions
35 import xml.sax.handler
# Module-level lock created at import time.
# NOTE(review): nothing in this part of the file acquires it -- confirm
# whether other modules import and rely on it before removing.
37 lock = Lock()
class OPMLParseJobHandler(ThreadPoolJobHandler):
    """Job handler registered under the id "opml-parse".

    The job's data is indexed as (url, category).  The URL is handed to
    the fetcher; once it has been fetched, the result is queued together
    with the category for an OPMLParseTaskThread to process.
    """

    job_id = "opml-parse"

    def __init__(self, job):
        ThreadPoolJobHandler.__init__(self, job)

        # presumably the number of worker threads -- verify against
        # ThreadPoolJobHandler.
        self.pool_size = 1
        self.task_class = OPMLParseTaskThread

    def _on_url_fetched(self, handler, task_result):
        """Observer for the fetcher's "task-done" event.

        Queues a (user_data, result) pair; user_data is the category
        that was attached to the fetch task in _prepare.  The handler
        argument is not used.
        """
        queued = (task_result.task.user_data, task_result.result)
        self.task_queue.put(queued)

    def _prepare(self):
        """Start fetching the job's URL and subscribe to its completion."""
        url, category = self.job.data[0], self.job.data[1]
        self.job_size = 1

        task = FetchTask(url = url, user_data = category)
        on_done = { "task-done": [ self._on_url_fetched ]}
        Fetcher.fetch([task], observers = [on_done])
class OPMLParseTaskThread(TaskThread):
    """Worker that turns fetched OPML content into Category/Feed objects."""

    def __init__(self, handler):
        TaskThread.__init__(self, handler)

    def _process(self, task):
        """Parse one queued task and return the objects built from it.

        task is indexed as (parent, content): task[1] is passed to
        read() and task[0] becomes the parent of the top-level outlines.
        Returns the list produced by _build_tree.
        """
        opml = read(task[1])
        return self._build_tree(opml.roots(), parent = task[0])

    @staticmethod
    def _first_attr(outline, names, default):
        """Return the value of the first of names present in outline,
        or default when none of them is present."""
        for name in names:
            if name in outline:
                return outline[name]
        return default

    def _build_tree(self, outlines, parent = None):
        """Convert outlines (and their descendants) into model objects.

        An outline becomes a Category when its type is "folder" or when
        it has children; anything else becomes a Feed.  The result is a
        flat list in depth-first order: every category is followed by
        the objects built from its children.  Each object's norder is
        the position of its outline among its siblings and its parent
        is the enclosing category (or the parent argument at the top).
        """
        save_list = []

        # enumerate() gives every outline its own position.  A manual
        # counter combined with an early "continue" for empty folders
        # would hand the same norder to the folder and its next sibling.
        for i, outline in enumerate(outlines):
            if "type" not in outline:
                # Some feeds exported from Liferea don't have "type" attribute.
                outline["type"] = "rss"

            if outline["type"] == "folder" or len(outline.children) > 0:
                category = Category()
                category.norder = i
                # Fall back the same way feeds do instead of failing
                # with KeyError on a folder that has no "text".
                category.name = self._first_attr(
                    outline, ("text", "title"), "[unknown title]")
                category.parent = parent

                save_list.append(category)

                if len(outline.children) > 0:
                    save_list.extend(
                        self._build_tree(outline.children, category))
            else:
                feed = Feed()
                feed.norder = i
                feed.title = self._first_attr(
                    outline, ("title", "text"), "[unknown title]")
                feed.parent = parent
                feed.location = self._first_attr(
                    outline, ("xmlUrl", "url"), "")
                feed.link = self._first_attr(
                    outline, ("htmlUrl", "url"), "")

                save_list.append(feed)

        return save_list
# Register the OPML parse handler with the job manager when this module is
# imported.  Presumably handlers are looked up by their job_id
# ("opml-parse") -- verify against JobManager.
121 JobManager.register_handler(OPMLParseJobHandler)
class OPML(dict):
    """An OPML document.

    The <head> fields are stored as dict items keyed by element name
    (see HEAD_KEYS); the top-level outlines of the <body> are kept in
    the ``outlines`` list.
    """

    # Child elements of <head>, in the order they are written.
    HEAD_KEYS = ('title', 'dateCreated', 'dateModified', 'ownerName',
                 'ownerEmail', 'expansionState', 'vertScrollState',
                 'windowTop', 'windowBottom', 'windowRight', 'windowLeft')

    # Misspelled keys that are still honoured, keyed by the element name
    # they stand for.
    _LEGACY_KEYS = {'windowBottom': 'windowBotton'}

    def __init__(self):
        dict.__init__(self)
        self.outlines = []

    def output(self, stream = sys.stdout):
        """Write the document as XML to stream.

        Only head fields that are present and not the empty string are
        written; a field whose value is None is written as an empty
        element.  Every object in ``outlines`` is asked to write itself
        through its output(xg) method.  No XML declaration is produced;
        callers that need a prolog write it themselves.
        """
        xg = XMLGenerator(stream, encoding='utf-8')

        def elemWithContent(name, content):
            xg.startElement(name, AttributesImpl({}))
            if content is not None:
                xg.characters(content)
            xg.endElement(name)
            xg.characters("\n")

        xg.startElement("opml", AttributesImpl({'version': '1.1'}))
        xg.startElement("head", AttributesImpl({}))
        for key in self.HEAD_KEYS:
            value = self.get(key, "")
            if value == "" and key in self._LEGACY_KEYS:
                # Value stored under the old misspelling: still emit it,
                # but under the correct element name.
                value = self.get(self._LEGACY_KEYS[key], "")
            if value != "":
                elemWithContent(key, value)
        xg.endElement("head")

        xg.startElement("body", AttributesImpl({}))
        for o in self.outlines:
            o.output(xg)
        xg.endElement("body")
        xg.endElement("opml")
class Outline(dict):
    """One <outline> element.

    The element's attributes are the dict items; nested outlines are
    kept in a private list and exposed through ``children``.
    """

    # Trailing comma matters: without it this is a plain string rather
    # than a one-element tuple.
    __slots__ = ('_children',)

    def __init__(self):
        dict.__init__(self)
        self._children = []

    def add_child(self, outline):
        """Append outline to this outline's children."""
        self._children.append(outline)

    def get_children_iter(self):
        """Return a fresh iterator over the children (see OIterator)."""
        return self.OIterator(self)

    children = property(get_children_iter, None, None,
                        "Fresh iterator over the child outlines; it also "
                        "supports len().")

    def output(self, xg):
        """Write this outline and, recursively, its children to xg.

        xg is used through startElement/endElement/characters, i.e. the
        xml.sax.saxutils.XMLGenerator interface.
        """
        xg.startElement("outline", AttributesImpl(self))
        for c in self.children:
            c.output(xg)
        xg.endElement("outline")
        xg.characters("\n")

    class OIterator(object):
        """Single-pass iterator over an outline's children.

        It reports the total number of children through len(), so an
        instance is falsy when the outline has no children.
        """

        def __init__(self, o):
            self._o = o
            self._index = -1

        def __iter__(self):
            return self

        def __len__(self):
            return len(self._o._children)

        def __next__(self):
            self._index += 1
            if self._index < len(self._o._children):
                return self._o._children[self._index]
            raise StopIteration

        # Python 2 spelling of the iterator protocol.
        next = __next__
class OutlineList(object):
    """Builds a forest of outlines from open/close events.

    A stack tracks the outlines that are currently open: a newly added
    outline becomes a child of the innermost open one, or a root when
    none is open.
    """

    def __init__(self):
        self._roots = []
        self._stack = []

    def add_outline(self, outline):
        """Attach outline to the innermost open outline (or the roots)
        and make it the innermost open outline."""
        if self._stack:
            attach = self._stack[-1].add_child
        else:
            attach = self._roots.append
        attach(outline)
        self._stack.append(outline)

    def close_outline(self):
        """Close the innermost open outline; no-op when none is open."""
        if not self._stack:
            return
        self._stack.pop()

    def roots(self):
        """Return the list of top-level outlines."""
        return self._roots
class OPMLHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects an OPML document.

    Head fields are stored on an OPML object (get_opml) and outline
    elements are collected in an OutlineList (get_outlines).
    """

    # <head> child elements whose text is stored on the OPML object.
    # 'windowBottom' is the element name used by OPML documents; the
    # misspelled 'windowBotton' is still recognised so that nothing
    # accepted before is dropped.
    _HEAD_KEYS = ('title', 'dateCreated', 'dateModified', 'ownerName',
                  'ownerEmail', 'expansionState', 'vertScrollState',
                  'windowTop', 'windowBottom', 'windowBotton',
                  'windowRight', 'windowLeft')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._outlines = OutlineList()
        self._opml = None
        self._content = ""

    def startElement(self, name, attrs):
        """Open an element.

        Raises ValueError when the first element of the document is not
        <opml>.  Every <outline> becomes an Outline carrying the
        element's attributes.
        """
        if self._opml is None:
            if name != 'opml':
                raise ValueError("This doesn't look like OPML")
            self._opml = OPML()
        if name == 'outline':
            o = Outline()
            o.update(attrs)
            self._outlines.add_outline(o)
        # Text is collected per element, so start afresh.
        self._content = ""

    def endElement(self, name):
        """Close an element, storing head fields and finished outlines."""
        if name == 'outline':
            self._outlines.close_outline()
            return
        if name == 'opml':
            self._opml.outlines = self._outlines.roots()
            return
        if name in self._HEAD_KEYS:
            self._opml[name] = self._content

    def characters(self, ch):
        """Accumulate character data of the current element."""
        self._content += ch

    def get_opml(self):
        """Return the OPML object, or None if no element was seen yet."""
        return self._opml

    def get_outlines(self):
        """Return the OutlineList holding the parsed outlines."""
        return self._outlines
def parse(stream):
    """Parse an OPML document and return its outlines.

    stream -- despite the name, the complete document as a string; it
              is passed to xml.sax.parseString.

    Returns what OPMLHandler.get_outlines() returns, i.e. the
    OutlineList filled while parsing.  OPMLHandler raises ValueError
    when the root element is not <opml>; errors reported by the XML
    parser itself propagate unchanged.
    """
    handler = OPMLHandler()
    xml.sax.parseString(stream, handler)
    return handler.get_outlines()
def export(root, filename):
    """Write the tree rooted at root to filename as an OPML document.

    root     -- node to export.  A node whose ``type`` is "F" is written
                as an rss outline using its title, link and location; a
                node whose ``type`` is "C" is written as a folder using
                its name.  Any other type yields an outline without
                attributes.
    filename -- target passed to gnomevfs.create.

    The file is closed even when writing fails.
    """
    opml = OPML()
    opml['title'] = "Exported from Straw"

    def _export(node):
        # Build the Outline for node and, recursively, its children.
        o = Outline()

        if node.type == "F":
            o['text'] = node.title.encode('utf-8')
            o['description'] = node.title.encode('utf-8')
            o['htmlUrl'] = node.link
            o['language'] = 'unknown'
            o['title'] = node.title.encode('utf-8')
            o['type'] = 'rss'
            o['version'] = 'RSS'
            o['xmlUrl'] = node.location
        elif node.type == "C":
            o['text'] = node.name.encode('utf-8')
            o['description'] = node.name.encode('utf-8')
            o['type'] = 'folder'

        # Nodes that expose no ``children`` attribute are treated as
        # leaves.
        for child_node in getattr(node, 'children', ()):
            o.add_child(_export(child_node))

        return o

    opml.outlines.append(_export(root))

    f = gnomevfs.create(filename, gnomevfs.OPEN_WRITE, 0)
    try:
        # The XML declaration has to be the very first thing in the
        # document; the DOCTYPE follows it.
        f.write('<?xml version="1.0"?>\n')
        f.write('<!DOCTYPE opml PUBLIC "-//Userland//DTD OPML XML V1.0//EN" ' + \
                '"http://static.userland.com/gems/radiodiscuss/opmlDtd.txt">')
        f.write('\n')
        opml.output(f)
    finally:
        f.close()
class BlogListEntry(object):
    """A labelled URL found in an outline: ``text`` and ``url``."""
    __slots__ = ('text', 'url')


def _find_entries(outline):
    """Return the entries for outline and all of its descendants.

    The entry for outline itself (if any) comes first, followed by the
    entries of its children in order.

    * type "link": an entry is produced when "url" is non-empty; its
      text is the "text" attribute (possibly empty).
    * any other type: the text is "text", or "title" when "text" is
      empty; the URL is "xmlUrl", or "htmlUrl" when "xmlUrl" is empty.
      No entry is produced when either the text or the URL ends up
      empty.
    """
    entries = []
    for child in outline.children:
        entries += _find_entries(child)

    kind = outline.get('type', '')
    text = outline.get('text', '')

    if kind == 'link':
        url = outline.get('url', '')
    else:
        if text == '':
            text = outline.get('title', '')
        if text == '':
            # Neither text nor title: nothing to label the entry with.
            url = ''
        else:
            # xmlUrl is the most likely feed URL; htmlUrl is only a
            # second guess.
            url = outline.get('xmlUrl', '')
            if url == '':
                url = outline.get('htmlUrl', '')

    if url != '':
        entry = BlogListEntry()
        entry.text = text
        entry.url = url
        entries.insert(0, entry)
    return entries


def find_entries(outlines):
    """Return the entries of every outline in outlines, concatenated in
    order (see _find_entries)."""
    entries = []
    for o in outlines:
        entries += _find_entries(o)
    return entries
def read(stream):
    """Parse the OPML document in stream and return the result of
    parse(), unchanged.

    stream is passed straight to parse().
    """
    return parse(stream)