Fixes (workarounds) in OPML parsing, more work on GUI...
[straw/fork.git] / straw / opml.py
blob6c24d5a599cddea1f3d12ff9d3614a1e07437940
1 """ OPML.py
3 """
5 __copyright__ = "Copyright (c) 2002-2005 Free Software Foundation, Inc."
6 __author__ = "Juri Pakaste <juri@iki.fi>"
7 __license__ = """
8 Straw is free software; you can redistribute it and/or modify it under the
9 terms of the GNU General Public License as published by the Free Software
10 Foundation; either version 2 of the License, or (at your option) any later
11 version.
13 Straw is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
15 A PARTICULAR PURPOSE. See the GNU General Public License for more details.
17 You should have received a copy of the GNU General Public License along with
18 this program; if not, write to the Free Software Foundation, Inc., 59 Temple
19 Place - Suite 330, Boston, MA 02111-1307, USA. """
22 from straw.JobManager import Job, TaskThread, TaskInfo, ThreadPoolJobHandler
23 from straw.model import Category, Feed
24 from threading import Lock
25 from xml.sax import saxutils, make_parser, SAXParseException
26 from xml.sax.handler import feature_namespaces, feature_namespace_prefixes
27 from xml.sax.saxutils import XMLGenerator
28 from xml.sax.xmlreader import AttributesImpl
29 import gnomevfs
30 import straw.JobManager as JobManager
31 import sys
32 import time
33 import xml.sax._exceptions
34 import xml.sax.handler
# Module-level lock object. Nothing in the visible part of this module
# acquires or releases it.
# NOTE(review): possibly unused here -- confirm no other module imports it
# before removing.
lock = Lock()
class OPMLParseJobHandler(ThreadPoolJobHandler):
    """Job handler that parses one OPML file into Category/Feed objects.

    The job's ``data`` is read as a two-item sequence: the path of the OPML
    file and a category value that is handed back unchanged with the result.
    A single task is queued and the pool size is fixed to one thread.
    """
    job_id = "opml-parse"

    def __init__(self, job):
        ThreadPoolJobHandler.__init__(self, job)

        self.pool_size = 1
        self.task_class = OPMLParseTaskThread

    def _split(self):
        """Queue the single parse task describing the file and category."""
        ti = TaskInfo(0, {"file_path": self.job.data[0],
                          "category": self.job.data[1]})
        self.task_queue.put(ti)

    def _prepare_result(self):
        """Return ``(tree, category)`` built from the finished task.

        ``tree`` is the flat list produced by ``_build_tree``; ``category``
        is the value that was stored in the task info by ``_split``.
        """
        task_result = self.result_queue.get()
        parsed = task_result.result
        # The task thread returns None when reading/parsing failed
        # (presumably that value arrives here as ``result`` -- confirm
        # against TaskThread). Treat it as an empty document instead of
        # failing with AttributeError on ``None.roots()``.
        if parsed is None:
            outlines = []
        else:
            outlines = parsed.roots()
        tree = self._build_tree(outlines)
        return (tree, task_result.task_info.data["category"])

    @staticmethod
    def _first_present(outline, keys, default):
        """Return the value of the first of ``keys`` present in ``outline``."""
        for key in keys:
            if key in outline:
                return outline[key]
        return default

    def _build_tree(self, outlines, parent=None):
        """Convert outlines into a flat, parent-linked list of model objects.

        outlines -- iterable of Outline objects sharing the same parent
        parent   -- Category the created objects are attached to, or None

        Every outline becomes either a Category (type "folder", or any
        outline that has children) or a Feed. Categories are followed in the
        list by everything created from their children. ``norder`` is the
        position of the outline among its siblings.
        """
        save_list = []

        # enumerate() keeps sibling positions unique; a manual counter that
        # is skipped by ``continue`` would give an empty folder and the
        # following sibling the same norder.
        for i, outline in enumerate(outlines):
            if "type" not in outline:
                # Some feeds exported from Liferea don't have "type" attribute.
                outline["type"] = "rss"

            if outline["type"] == "folder" or len(outline.children) > 0:
                category = Category()
                category.norder = i
                # Fall back to "title" so a folder lacking "text" does not
                # abort the whole import with a KeyError.
                category.name = self._first_present(
                    outline, ("text", "title"), "[unknown title]")
                category.parent = parent

                save_list.append(category)

                if not outline.children:
                    continue

                save_list.extend(self._build_tree(outline.children, category))
            else:
                feed = Feed()
                feed.norder = i
                feed.title = self._first_present(
                    outline, ("title", "text"), "[unknown title]")
                feed.parent = parent
                feed.location = self._first_present(
                    outline, ("xmlUrl", "url"), "")
                feed.link = self._first_present(
                    outline, ("htmlUrl", "url"), "")

                save_list.append(feed)

        return save_list
class OPMLParseTaskThread(TaskThread):
    """Worker thread that reads the OPML file named in a task."""

    def __init__(self, handler, task_queue, result_queue):
        TaskThread.__init__(self, handler, task_queue, result_queue)

    def _process(self, task):
        """Parse ``task.data["file_path"]`` and return the result of ``read``.

        Returns None when the file cannot be opened or parsed; the error is
        printed rather than propagated, so a bad file does not take the
        worker thread down.
        """
        opml = None

        try:
            # The context manager closes the file on success and on error;
            # previously the handle was never closed.
            with open(task.data["file_path"]) as fstream:
                opml = read(fstream)
        except Exception as inst:
            print(inst)

        return opml
# Import-time side effect: hand the handler class to the job manager.
# NOTE(review): presumably looked up later via its job_id ("opml-parse") --
# confirm against JobManager.register_handler.
JobManager.register_handler(OPMLParseJobHandler)
class OPML(dict):
    """An OPML document: head values as dict items plus a list of outlines.

    The dict items hold the <head> values keyed by element name;
    ``outlines`` holds the top-level Outline objects of the <body>.
    """

    # <head> child elements that output() writes, in output order.
    _HEAD_KEYS = ('title', 'dateCreated', 'dateModified', 'ownerName',
                  'ownerEmail', 'expansionState', 'vertScrollState',
                  'windowTop', 'windowBottom', 'windowRight', 'windowLeft')

    # Earlier code used a misspelled key for windowBottom; values stored
    # under the old key are still written, using the correct element name.
    _LEGACY_KEYS = {'windowBottom': 'windowBotton'}

    def __init__(self):
        dict.__init__(self)
        self.outlines = []

    def output(self, stream=sys.stdout):
        """Write the document as OPML 1.1 markup to ``stream``.

        Only head keys that are present and not the empty string are
        written. No XML declaration is emitted; callers that need one must
        write it themselves before calling this method.
        """
        xg = XMLGenerator(stream, encoding='utf-8')
        no_attrs = AttributesImpl({})

        def elem_with_content(name, content):
            xg.startElement(name, no_attrs)
            if content is not None:
                xg.characters(content)
            xg.endElement(name)
            xg.characters("\n")

        xg.startElement("opml", AttributesImpl({'version': '1.1'}))
        xg.startElement("head", no_attrs)
        for key in self._HEAD_KEYS:
            lookup = key
            if key not in self and key in self._LEGACY_KEYS:
                lookup = self._LEGACY_KEYS[key]
            if lookup in self and self[lookup] != "":
                elem_with_content(key, self[lookup])
        xg.endElement("head")
        xg.startElement("body", no_attrs)
        for o in self.outlines:
            o.output(xg)
        xg.endElement("body")
        xg.endElement("opml")
class Outline(dict):
    """One <outline> element: its attributes as dict items, plus children."""

    # A one-element tuple; without the trailing comma this is just a string.
    __slots__ = ('_children',)

    def __init__(self):
        dict.__init__(self)
        self._children = []

    def add_child(self, outline):
        """Append ``outline`` as the last child of this outline."""
        self._children.append(outline)

    def get_children_iter(self):
        """Return a fresh iterator over the children that also supports len()."""
        return self.OIterator(self)

    children = property(get_children_iter, None, None,
                        "Fresh iterator over the child outlines; supports len().")

    def output(self, xg):
        """Write this outline and, nested inside it, its children to ``xg``.

        xg -- an XMLGenerator-like object providing startElement,
              endElement and characters
        """
        xg.startElement("outline", AttributesImpl(self))
        for c in self.children:
            c.output(xg)
        xg.endElement("outline")
        xg.characters("\n")

    class OIterator(object):
        """Single-pass iterator over an outline's children that knows its length."""

        def __init__(self, o):
            self._o = o
            self._index = -1

        def __iter__(self):
            return self

        def __len__(self):
            # Total number of children, not the number left to iterate.
            return len(self._o._children)

        def next(self):
            self._index += 1
            if self._index < len(self._o._children):
                return self._o._children[self._index]
            else:
                raise StopIteration

        # Python 3 spelling of the iterator protocol.
        __next__ = next
class OutlineList(object):
    """Builds a forest of outlines from open/close events.

    ``_stack`` holds the currently open outlines, innermost last;
    ``_roots`` collects the outlines that were opened with nothing else open.
    """

    def __init__(self):
        self._roots = []
        self._stack = []

    def add_outline(self, outline):
        """Open ``outline`` under the innermost open outline, or as a root."""
        if not self._stack:
            self._roots.append(outline)
        else:
            innermost = self._stack[-1]
            innermost.add_child(outline)
        self._stack.append(outline)

    def close_outline(self):
        """Close the innermost open outline; do nothing if none is open."""
        if self._stack:
            self._stack.pop()

    def roots(self):
        """Return the list of top-level outlines (the internal list itself)."""
        return self._roots
class OPMLHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects an OPML document.

    Outlines are gathered into an OutlineList; recognised head elements are
    stored as items of an OPML object keyed by element name.
    """

    # Element names whose text content is stored on the OPML object.
    # 'windowBotton' is a historical misspelling kept so input that uses it
    # is still captured; 'windowBottom' is the correctly spelled element.
    _HEAD_ELEMENTS = ('title', 'dateCreated', 'dateModified', 'ownerName',
                      'ownerEmail', 'expansionState', 'vertScrollState',
                      'windowTop', 'windowBottom', 'windowBotton',
                      'windowRight', 'windowLeft')

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._outlines = OutlineList()
        self._opml = None
        self._content = ""

    def startElement(self, name, attrs):
        """Handle an opening tag.

        Raises ValueError if the first element seen is not <opml>.
        """
        if self._opml is None:
            if name != 'opml':
                raise ValueError("This doesn't look like OPML")
            self._opml = OPML()
        if name == 'outline':
            o = Outline()
            o.update(attrs)
            self._outlines.add_outline(o)
        # Text is accumulated per element: start collecting afresh.
        self._content = ""

    def endElement(self, name):
        """Handle a closing tag: close outlines, store head values."""
        if name == 'outline':
            self._outlines.close_outline()
        elif name == 'opml':
            self._opml.outlines = self._outlines.roots()
        elif name in self._HEAD_ELEMENTS:
            self._opml[name] = self._content

    def characters(self, ch):
        """Accumulate character data for the element currently being read."""
        self._content += ch

    def get_opml(self):
        """Return the OPML object, or None if no <opml> element was seen."""
        return self._opml

    def get_outlines(self):
        """Return the OutlineList holding the parsed outlines."""
        return self._outlines
def parse(stream):
    """Parse OPML from ``stream`` and return the handler's OutlineList.

    stream -- anything accepted by the SAX parser's parse() method

    Raises ValueError (from the content handler) when the root element is
    not <opml>; SAX parse errors are not caught here.
    """
    from xml.sax import SAXNotRecognizedException, SAXNotSupportedException
    from xml.sax.handler import feature_external_ges

    parser = make_parser()
    parser.setFeature(feature_namespaces, 0)
    # OPML files come from outside the application, so do not let the parser
    # fetch external general entities. Drivers that do not know the feature
    # are left at their default.
    try:
        parser.setFeature(feature_external_ges, 0)
    except (SAXNotRecognizedException, SAXNotSupportedException):
        pass
    handler = OPMLHandler()
    parser.setContentHandler(handler)

    parser.parse(stream)
    return handler.get_outlines()
def export(title, list, fname):
    """Write the given feeds as an OPML document to ``fname``.

    title -- stored as the document's head title
    list  -- iterable of feed objects; each must provide title,
             channel_description, channel_link, channel_title and
             access_info (whose first item is written as xmlUrl).
             The name shadows the builtin but is kept so keyword callers
             keep working.
    fname -- target passed to gnomevfs.create
    """
    opml = OPML()
    opml['title'] = title
    for feed in list:
        o = Outline()
        o['text'] = feed.title.encode('utf-8')
        o['description'] = feed.channel_description.encode('utf-8')
        o['htmlUrl'] = feed.channel_link
        o['language'] = 'unknown'
        o['title'] = feed.channel_title.encode('utf-8')
        o['type'] = 'rss'
        o['version'] = 'RSS'
        o['xmlUrl'] = feed.access_info[0]
        opml.outlines.append(o)
    f = gnomevfs.create(fname, gnomevfs.OPEN_WRITE, 0)
    try:
        f.write('<?xml version="1.0"?>\n')
        opml.output(f)
    finally:
        # Release the handle even when writing fails part-way.
        f.close()
class BlogListEntry(object):
    """Minimal record with two attributes: ``text`` and ``url``.

    Instances start with neither attribute set; __slots__ prevents any
    other attribute from being added.
    """
    __slots__ = ("text", "url")
281 def _find_entries(outline):
282 entries = []
283 for c in outline.children:
284 entries += _find_entries(c)
285 type = outline.get('type', '')
286 text = outline.get('text', '')
287 e = None
288 if type == 'link':
289 url = outline.get('url', '')
290 if url != '':
291 e = BlogListEntry()
292 e.text = text
293 e.url = url
294 else:
295 xmlurl = outline.get('xmlUrl', '')
296 e = BlogListEntry()
297 e.text = text
298 if text == '':
299 title = outline.get('title', '')
300 if title == '':
301 e = None
302 e.text = title
303 if e != None:
304 if xmlurl != '':
305 # there's something in xmlurl. There's a good chance that's
306 # our feed's URL
307 e.url = xmlurl
308 else:
309 htmlurl = outline.get('htmlUrl', '')
310 if htmlurl != '':
311 # there's something in htmlurl, and xmlurl is empty. This
312 # might be our feed's URL.
313 e.url = htmlurl
314 else:
315 # nothing else to try.
316 e = None
317 if e is not None:
318 entries[0:0] = [e]
319 return entries
def find_entries(outlines):
    """Collect the entries of every outline in ``outlines`` into one list.

    The per-outline results of _find_entries are concatenated in input
    order.
    """
    collected = []
    for outline in outlines:
        collected.extend(_find_entries(outline))
    return collected
def read(stream):
    """Parse OPML from ``stream`` and return what ``parse`` returns.

    Returns None when parsing raises ValueError (the document's root is not
    <opml>). Other exceptions, including SAX parse errors, propagate to the
    caller.

    The entry-deduplication code that used to follow the try block could
    never run, because both paths return before reaching it, and it read an
    ``outlines`` attribute that the parse result does not provide. It has
    been removed.
    """
    try:
        return parse(stream)
    except ValueError:
        return None
342 return ret