Improved handling of exceptions in TaskThread.
[straw.git] / straw / opml.py
blob8a283d88af6f561c2d421bfbf7d3ebd8cee4ff2e
1 """ OPML.py
3 """
5 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
6 __author__ = "Juri Pakaste <juri@iki.fi>"
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
22 from straw.JobManager import Job, TaskThread, TaskInfo, ThreadPoolJobHandler
23 from straw.model import Category, Feed
24 from threading import Lock
25 from xml.sax import saxutils, make_parser, SAXParseException
26 from xml.sax.handler import feature_namespaces, feature_namespace_prefixes
27 from xml.sax.saxutils import XMLGenerator
28 from xml.sax.xmlreader import AttributesImpl
29 import gnomevfs
30 import straw.JobManager as JobManager
31 import sys
32 import time
33 import xml.sax._exceptions
34 import xml.sax.handler
# Module-level lock. Nothing in the visible part of this module acquires it;
# presumably it is used by code that imports this module -- TODO confirm
# before removing.
lock = Lock()
class OPMLParseJobHandler(ThreadPoolJobHandler):
    """Job handler that turns one OPML file into Category/Feed objects.

    The job's ``data`` is indexed as ``(file_path, category)``: the file is
    parsed by a single OPMLParseTaskThread and the resulting outlines are
    converted into a flat list of model objects parented under ``category``.
    """

    job_id = "opml-parse"

    def __init__(self, job):
        ThreadPoolJobHandler.__init__(self, job)

        # Only one task is ever queued (see _split), so one worker suffices.
        self.pool_size = 1
        self.task_class = OPMLParseTaskThread

    def _split(self):
        """Queue the single parse task built from ``self.job.data``."""
        ti = TaskInfo(0, {"file_path": self.job.data[0],
                          "category": self.job.data[1]})
        self.task_queue.put(ti)

    def _prepare_result(self):
        """Return ``(tree, category)`` for the finished parse task.

        ``tree`` is the list produced by _build_tree.  It is empty when the
        task produced no outline list.
        """
        task_result = self.result_queue.get()
        category = task_result.task_info.data["category"]
        outline_list = task_result.result

        if outline_list is None:
            # OPMLParseTaskThread._process returns None when the file could
            # not be read or parsed; presumably that value arrives here as
            # ``result`` (TODO confirm in JobManager).  Calling .roots() on it
            # used to raise AttributeError.
            return ([], category)

        tree = self._build_tree(outline_list.roots(), parent=category)
        return (tree, category)

    def _build_tree(self, outlines, parent=None):
        """Convert outlines into a flat, depth-first list of model objects.

        Each outline that is a folder, or that has children, becomes a
        Category followed by the objects built from its children; every other
        outline becomes a Feed.  ``norder`` is the outline's position among
        its siblings and ``parent`` is the enclosing Category (or the
        ``parent`` argument at the top level).
        """
        save_list = []

        # enumerate() keeps norder in step with the sibling position.  The
        # previous manual counter was skipped for empty folders, which gave
        # the following sibling a duplicate norder.
        for norder, outline in enumerate(outlines):
            # Some feeds exported from Liferea don't have "type" attribute;
            # treat them as RSS without modifying the parsed outline.
            outline_type = outline.get("type", "rss")

            if outline_type == "folder" or len(outline.children) > 0:
                category = Category()
                category.norder = norder
                # Fall back the same way the feed branch does instead of
                # raising KeyError on a folder without a "text" attribute.
                category.name = outline.get(
                    "text", outline.get("title", "[unknown title]"))
                category.parent = parent

                save_list.append(category)
                save_list.extend(
                    self._build_tree(outline.children, category))
            else:
                feed = Feed()
                feed.norder = norder
                feed.title = outline.get(
                    "title", outline.get("text", "[unknown title]"))
                feed.parent = parent
                feed.location = outline.get(
                    "xmlUrl", outline.get("url", ""))
                feed.link = outline.get(
                    "htmlUrl", outline.get("url", ""))

                save_list.append(feed)

        return save_list
class OPMLParseTaskThread(TaskThread):
    """Worker thread that reads and parses one OPML file."""

    def __init__(self, handler, task_queue, result_queue):
        TaskThread.__init__(self, handler, task_queue, result_queue)

    def _process(self, task):
        """Parse the file named by ``task.data["file_path"]``.

        Returns whatever ``read`` returns for the opened file, or None when
        opening or parsing raised an exception.  This is deliberately
        best-effort: the exception is printed and not propagated.
        """
        opml = None

        try:
            fstream = open(task.data["file_path"])
            try:
                opml = read(fstream)
            finally:
                # Always release the file handle, even when parsing fails.
                fstream.close()
        except Exception as inst:
            print(inst)

        return opml
# Register the handler class with JobManager as a side effect of importing
# this module.  Presumably handlers are looked up by their ``job_id``
# ("opml-parse") -- TODO confirm in straw.JobManager.
JobManager.register_handler(OPMLParseJobHandler)
class OPML(dict):
    """An OPML document: head metadata as dict items plus a list of outlines.

    Head values are stored under their element names (for example
    ``opml['title']``).  ``outlines`` holds the top-level outline objects;
    each one must provide an ``output(xg)`` method.
    """

    # Head elements written by output(), in output order.
    _HEAD_KEYS = ('title', 'dateCreated', 'dateModified', 'ownerName',
                  'ownerEmail', 'expansionState', 'vertScrollState',
                  'windowTop', 'windowBottom', 'windowRight', 'windowLeft')

    def __init__(self):
        dict.__init__(self)
        self.outlines = []

    def output(self, stream=sys.stdout):
        """Write the document as OPML 1.1 XML to ``stream``.

        Only head keys that are present and not the empty string are written.
        No XML declaration is emitted; callers that need one write it
        themselves.
        """
        xg = XMLGenerator(stream, encoding='utf-8')

        def elemWithContent(name, content):
            # One head element per line; None yields an empty element.
            xg.startElement(name, AttributesImpl({}))
            if content is not None:
                xg.characters(content)
            xg.endElement(name)
            xg.characters("\n")

        xg.startElement("opml", AttributesImpl({'version': '1.1'}))
        xg.startElement("head", AttributesImpl({}))
        for key in self._HEAD_KEYS:
            value = self.get(key, "")
            if key == 'windowBottom' and value == "":
                # Older code stored this value under the misspelled key
                # 'windowBotton'; keep honoring it, but emit the element
                # under its correct name.
                value = self.get('windowBotton', "")
            if value != "":
                elemWithContent(key, value)
        xg.endElement("head")

        xg.startElement("body", AttributesImpl({}))
        for o in self.outlines:
            o.output(xg)
        xg.endElement("body")
        xg.endElement("opml")
class Outline(dict):
    """One OPML ``<outline>`` element.

    The element's XML attributes are the dict items; nested outlines are kept
    in a private child list and exposed through the ``children`` property.
    """

    # Must be a tuple: a bare string only works by accident.
    __slots__ = ('_children',)

    def __init__(self):
        dict.__init__(self)
        self._children = []

    def add_child(self, outline):
        """Append ``outline`` as the last child of this outline."""
        self._children.append(outline)

    def get_children_iter(self):
        """Return a fresh OIterator over this outline's children."""
        return self.OIterator(self)

    children = property(get_children_iter, None, None,
                        "Fresh iterator over the child outlines; it also "
                        "supports len().")

    def output(self, xg):
        """Write this outline and all of its descendants to ``xg``.

        ``xg`` is used through startElement/endElement/characters, e.g. an
        xml.sax.saxutils.XMLGenerator.
        """
        xg.startElement("outline", AttributesImpl(self))
        for c in self._children:
            c.output(xg)
        xg.endElement("outline")
        xg.characters("\n")

    class OIterator(object):
        """Single-pass iterator over an Outline's children.

        ``len()`` reports the total number of children, not the number
        remaining, so an iterator over an empty child list is falsy.
        """

        def __init__(self, o):
            self._o = o
            self._index = -1

        def __iter__(self):
            return self

        def __len__(self):
            return len(self._o._children)

        def next(self):
            self._index += 1
            if self._index < len(self._o._children):
                return self._o._children[self._index]
            else:
                raise StopIteration

        # Python 3 spelling of the iterator protocol.
        __next__ = next
class OutlineList(object):
    """Builds a forest of outlines from open/close events.

    ``_stack`` holds the chain of currently open outlines, innermost last;
    ``_roots`` collects the outlines that were opened with nothing else open.
    """

    def __init__(self):
        self._roots = []
        self._stack = []

    def add_outline(self, outline):
        """Open ``outline`` below the innermost open outline, if there is one.

        With nothing open, the outline becomes a new root.  Either way it is
        the innermost open outline afterwards.
        """
        if self._stack:
            enclosing = self._stack[-1]
            enclosing.add_child(outline)
        else:
            self._roots.append(outline)
        self._stack.append(outline)

    def close_outline(self):
        """Close the innermost open outline; do nothing when none is open."""
        if not self._stack:
            return
        self._stack.pop()

    def roots(self):
        """Return the list of top-level outlines (the live list, not a copy)."""
        return self._roots
class OPMLHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects an OPML document.

    Outline elements are gathered into an OutlineList and the text of known
    head elements is stored on an OPML object under the element name.
    """

    # Head elements whose text is captured.  'windowBottom' is the correct
    # element name; the misspelled 'windowBotton' is still accepted so files
    # written with that spelling keep being read as before.
    _HEAD_KEYS = ('title', 'dateCreated', 'dateModified', 'ownerName',
                  'ownerEmail', 'expansionState', 'vertScrollState',
                  'windowTop', 'windowBottom', 'windowBotton',
                  'windowRight', 'windowLeft')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._outlines = OutlineList()
        self._opml = None
        self._content = ""

    def startElement(self, name, attrs):
        """Handle an opening tag.

        Raises ValueError when the first element seen is not ``opml``.
        """
        if self._opml is None:
            if name != 'opml':
                raise ValueError("This doesn't look like OPML")
            self._opml = OPML()
        if name == 'outline':
            o = Outline()
            o.update(attrs)
            self._outlines.add_outline(o)
        # Text is collected per element, so start over on every opening tag.
        self._content = ""

    def endElement(self, name):
        """Handle a closing tag: close outlines and store head values."""
        if name == 'outline':
            self._outlines.close_outline()
            return
        if name == 'opml':
            self._opml.outlines = self._outlines.roots()
            return
        if name in self._HEAD_KEYS:
            self._opml[name] = self._content

    def characters(self, ch):
        """Accumulate character data for the element currently open."""
        self._content += ch

    def get_opml(self):
        """Return the OPML object, or None if no element has been seen."""
        return self._opml

    def get_outlines(self):
        """Return the OutlineList holding the parsed outlines."""
        return self._outlines
def parse(stream):
    """Run a SAX parse of ``stream`` and return the collected outlines.

    Namespace processing is switched off, so element names reach the handler
    exactly as written.  The return value is the OutlineList gathered by an
    OPMLHandler.  Exceptions raised by the parser or the handler propagate.
    """
    content_handler = OPMLHandler()

    sax_parser = make_parser()
    sax_parser.setFeature(feature_namespaces, 0)
    sax_parser.setContentHandler(content_handler)
    sax_parser.parse(stream)

    return content_handler.get_outlines()
def export(root, filename):
    """Write the tree rooted at ``root`` to ``filename`` as an OPML file.

    Nodes with ``type == "F"`` are written as RSS feed outlines (using their
    ``title``, ``link`` and ``location``); nodes with ``type == "C"`` are
    written as folder outlines (using their ``name``).  Children are exported
    recursively.  The file is created through gnomevfs.
    """
    opml = OPML()
    opml['title'] = "Exported from Straw"

    def _export(node):
        # Build the Outline for one node and, recursively, its children.
        o = Outline()

        if node.type == "F":
            o['text'] = node.title.encode('utf-8')
            o['description'] = node.title.encode('utf-8')
            o['htmlUrl'] = node.link
            o['language'] = 'unknown'
            o['title'] = node.title.encode('utf-8')
            o['type'] = 'rss'
            o['version'] = 'RSS'
            o['xmlUrl'] = node.location
        elif node.type == "C":
            o['text'] = node.name.encode('utf-8')
            o['description'] = node.name.encode('utf-8')
            o['type'] = 'folder'

        # NOTE(review): nodes without a ``children`` attribute, or with a
        # None/empty one, are treated as leaves -- confirm that every model
        # node type is expected to expose ``children``.
        for child_node in getattr(node, 'children', None) or ():
            o.add_child(_export(child_node))

        return o

    opml.outlines.append(_export(root))

    f = gnomevfs.create(filename, gnomevfs.OPEN_WRITE, 0)
    try:
        f.write('<?xml version="1.0"?>\n')
        opml.output(f)
    finally:
        # Close the handle even when writing fails part-way.
        f.close()
class BlogListEntry(object):
    """Plain record for one feed reference found in an OPML outline.

    Instances start without values; _find_entries assigns ``text`` (the
    display text) and ``url`` after creating one.
    """
    # Only these two attributes can be set on an instance.
    __slots__ = ('text', 'url')
296 def _find_entries(outline):
297 entries = []
298 for c in outline.children:
299 entries += _find_entries(c)
300 type = outline.get('type', '')
301 text = outline.get('text', '')
302 e = None
303 if type == 'link':
304 url = outline.get('url', '')
305 if url != '':
306 e = BlogListEntry()
307 e.text = text
308 e.url = url
309 else:
310 xmlurl = outline.get('xmlUrl', '')
311 e = BlogListEntry()
312 e.text = text
313 if text == '':
314 title = outline.get('title', '')
315 if title == '':
316 e = None
317 e.text = title
318 if e != None:
319 if xmlurl != '':
320 # there's something in xmlurl. There's a good chance that's
321 # our feed's URL
322 e.url = xmlurl
323 else:
324 htmlurl = outline.get('htmlUrl', '')
325 if htmlurl != '':
326 # there's something in htmlurl, and xmlurl is empty. This
327 # might be our feed's URL.
328 e.url = htmlurl
329 else:
330 # nothing else to try.
331 e = None
332 if e is not None:
333 entries[0:0] = [e]
334 return entries
def find_entries(outlines):
    """Return the entries found in every outline of ``outlines``, in order.

    Each outline is searched together with its descendants; the results are
    concatenated into one new list.
    """
    collected = []
    for outline in outlines:
        collected.extend(_find_entries(outline))
    return collected
def read(stream):
    """Parse OPML from ``stream`` and return the resulting outline list.

    Returns whatever ``parse`` returns, or None when parsing raises
    ValueError (OPMLHandler raises it when the first element is not
    ``opml``).  Other exceptions, such as SAX parse errors, propagate to the
    caller.

    The duplicate-filtering code that used to follow the try block could
    never run, because both paths above it return; it has been removed.
    """
    try:
        return parse(stream)
    except ValueError:
        return None