Updated to support automatic internal retry in case of malformed XML answer
[radcan.git] / radcan.py
blob96080dd8ba30b000fe7ab71961d4cc3a719fcee1
1 """
2 Client to Radio-Canada's Video-on-Demand web service. It can be reused
3 as a module, or invoked as a command line script.
4 """
5 #-------------------------------------------------------------------------------
6 # Legalese
8 __copyright__ = \
9 '''Copyright(C), 2007, Sylvain Fourmanoit <syfou@users.sourceforge.net>, 2007.
11 Permission is hereby granted, free of charge, to any person obtaining a copy
12 of this software and associated documentation files (the "Software"), to
13 deal in the Software without restriction, including without limitation the
14 rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
15 sell copies of the Software, and to permit persons to whom the Software is
16 furnished to do so, subject to the following conditions:
18 The above copyright notice and this permission notice shall be included in
19 all copies of the Software and its documentation and acknowledgment shall be
20 given in the documentation and software packages that this Software was
21 used.
23 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
24 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
25 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
26 THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
27 IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
28 CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
29 '''
30 __version__ = '0.9.3'
32 #-------------------------------------------------------------------------------
34 import httplib, sys, os, optparse, pprint
36 try:
37 import xml.etree.ElementTree as etree
38 except ImportError:
39 import elementtree.ElementTree as etree
41 import xml.parsers.expat
43 #-------------------------------------------------------------------------------
44 # Utility class
class Missing(dict):
    """
    Dictionary with a fallback value for absent keys.

    Emulates the __missing__ hook for python 2.4: looking up a key that
    is not stored returns `default % key` when that formatting works,
    or `default` untouched when it raises TypeError (for instance when
    `default` holds no format specifier, or is not a string at all).
    """
    def __init__(self, iterable=None, default=''):
        # An empty/None iterable leaves the dictionary empty
        if iterable:
            dict.__init__(self, iterable)
        self.default = default

    def __getitem__(self, k):
        if dict.__contains__(self, k):
            return dict.__getitem__(self, k)
        # Absent key: try to mention it in the fallback value
        fallback = self.default
        try:
            fallback = fallback % k
        except TypeError:
            pass
        return fallback
59 #-------------------------------------------------------------------------------
60 # XML answer processing
class XML2Py:
    """
    Convert an arbitrary XML file to a pythonic structure.

    Calling an instance with a file-like object parses it and returns:

    - a dictionary mapping child tag names (name spaces removed) to
      their converted content, for elements having several children;
    - a list of values when the same tag is repeated among siblings;
    - the bare value itself when an element has exactly one distinct
      child tag (single-child chains are collapsed);
    - for leaf elements: None (no text), an int (all-digit text), a
      boolean ('true'/'false', any case) or the raw text otherwise.

    Right now, it is instantiated from the Console.SOAPQuery instances
    during calls to console: this could be modularised (we could
    basically just return a file-like object, and let the caller deals
    with the xml content) but we didn't see a real need: set
    Console.debug to true if ever you need to debug.

    Be aware that dynamically changing XML2Py will likely break
    Console.GetList2Alt().
    """
    class TreeBuilder(getattr(etree, 'XMLTreeBuilder', None) or
                      etree.XMLParser):
        """Get rid of all the name space cruft"""
        # NOTE: _fixname is a private hook of the pure python parser;
        # other implementations ignore it, hence _myiter() also strips
        # the name space from every tag it reads.
        def _fixname(self, key): return key.split('}')[-1]

    def __call__(self, f):
        """Parse file-like object f and return the converted structure."""
        return self._myiter(
            etree.parse(f, self.TreeBuilder()).getroot())

    def _convert(self, text):
        """Turn the text of a leaf element into None, int, bool or text."""
        if text is None:
            return None
        if text.isdigit():
            try:
                return int(text)
            except ValueError:
                # Some unicode digits satisfy isdigit() but not int()
                return text
        lowered = text.lower()
        if lowered == 'true':
            return True
        if lowered == 'false':
            return False
        return text

    def _myiter(self, root):
        """Recursively convert the children of element root."""
        tree = {}
        for e in root:
            tag = e.tag.split('}')[-1]
            # Decide on the element itself rather than on the converted
            # subtree: a subtree collapsed to an int, a boolean or None
            # has no len(), and used to be silently replaced by the
            # text of its parent.
            if len(e) > 0:
                value = self._myiter(e)
            else:
                value = self._convert(e.text)
            if tag in tree:
                if isinstance(tree[tag], list):
                    tree[tag].append(value)
                else:
                    tree[tag] = [tree[tag], value]
            else:
                tree[tag] = value
        if len(tree) == 1:
            # Collapse single-entry levels (list() keeps this working
            # where values() is not a list)
            tree = list(tree.values())[0]
        return tree
112 #-------------------------------------------------------------------------------
113 # SOAP interface access
class Console:
    """
    Radio-Canada querying engine. Just use something like:

      console.GetList2(MotsCle='politique')

    and you are done!

    Any attribute that is not otherwise defined is turned into a
    SOAPQuery instance for the port of the same name (see
    __getattr__()); `queries` holds the default fields, and optional
    body wrappers, for the ports known out of the box.
    """
    queries = {
        'GetEmissions': {
            'fields': {'IDRegion': 1}
            },
        'GetListSuggere': {
            'fields': {'IdMedia': -1,
                       'IdRegion': -1}
            },
        'GetListAujourdhui': {
            'fields': {'IdRegion': -1}
            },
        'GetList2': {
            # 'del' marks a field left out of the request unless the
            # caller supplies a value; None yields an empty element
            'fields': {'IDEmission': 'del',
                       'Chaine': None,
                       'Video': -1,
                       'IDTypeContenu': 0,
                       'MotsCle': None,
                       'StartItem': 1,
                       'NbrItem': 10,
                       'DateOffset': 7,
                       'NbJours': 0,
                       'Tri': 'date:D',
                       'StrIDTypeContenu': None,
                       'StrIDCategorie': None,
                       'StrIDGenre': None,
                       'IDReseau': 0,
                       'IDRegion': -1,
                       'Integrale': -1},
            'prebody': '<requete>',
            'postbody':'</requete>'}
        }

    class SOAPQuery:
        """
        Query a given port of Radio-Canada SOAP interface.

        Returned by Console.__getattr__().
        """
        request = '''<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<soap:Body>
<%(port)s xmlns="http://tempuri.org/">
%(prebody)s
%(body)s
%(postbody)s
</%(port)s>
</soap:Body>
</soap:Envelope>'''
        class Attrs(dict):
            def __missing__(self, name): return ''
        class Tee:
            '''Automated echo of file-like objects on read'''
            def __init__(self, f, stream=None, tee=True):
                self.f = f
                if stream is not None:
                    self.stream = stream
                else:
                    self.stream = sys.stdout
                self.tee = tee

            def read(self, size=-1):
                s = self.f.read(size)
                if self.tee:
                    self.stream.write(s)
                return s

        def __init__(self, **kw):
            """
            Store the query parameters: 'port', 'fields', 'debug',
            'proxy', 'retry' and optionally 'prebody' and 'postbody'.
            Absent parameters read as the empty string.
            """
            self.kw = Missing(kw)

        def splitproxy(self, http_proxy):
            """
            Split a 'http://host:port' proxy specification and return
            [host, port], both as strings. Anything following the port
            (such as the usual trailing slash) is ignored.

            Raise RuntimeError if the specification does not start with
            'http://', or lacks a host or a numeric port.
            """
            error = RuntimeError('http proxy is not in the expected ' +
                                 'http://host:port format')
            if 'http://' != http_proxy[:7]:
                raise error
            hostport = http_proxy[7:].split('/')[0].split(':')
            if (len(hostport) != 2 or not hostport[0]
                or not hostport[1].isdigit()):
                raise error
            return hostport

        def __call__(self, **fields):
            """
            Send the request to the server and return the answer, as
            converted by XML2Py.

            Keyword arguments override the default fields of the port.
            The request is sent again, up to 'retry' times in total
            (at least once), as long as the answer is not well-formed
            XML; an empty dictionary is returned if no attempt gave a
            usable answer.
            """
            from xml.sax.saxutils import escape

            def tag(k, v):
                if v is not None:
                    # Escape &, < and > so that values such as search
                    # keywords cannot break the request
                    return '<%s>%s</%s>' % (k, escape('%s' % (v,)), k)
                else:
                    return '<%s/>' % k

            # Prepare the request
            #
            merged = dict(list(self.kw['fields'].items()) +
                          list(fields.items()))
            self.kw['body'] = '\n'.join([tag(k, v)
                                         for k, v in merged.items()
                                         if v != 'del'])
            request = self.request % self.kw
            headers = {'Content-Type': 'text/xml',
                       'SOAPAction': 'http://tempuri.org/%(port)s' % self.kw }

            # Set up the connection parameters, either directly or through
            # a proxy (None or an empty string both mean "no proxy")
            if self.kw['proxy']:
                host, port = self.splitproxy(self.kw['proxy'])
                uri = 'http://www1.radio-canada.ca/aspx/WSConsole/console.asmx'
            else:
                host, port = ('www1.radio-canada.ca', 80)
                uri = '/aspx/WSConsole/console.asmx'

            # Malformed answers are reported as ExpatError by older
            # ElementTree versions, and as ParseError by newer ones
            parse_errors = (xml.parsers.expat.ExpatError,)
            if hasattr(etree, 'ParseError'):
                parse_errors = parse_errors + (etree.ParseError,)

            # Always make at least one attempt
            attempts = self.kw['retry']
            if not isinstance(attempts, int) or attempts < 1:
                attempts = 1

            debuglevel = 0
            if self.kw['debug'] is True:
                debuglevel = 2

            # Finally, time to connect
            ret = {}
            for i in range(attempts):
                conn = httplib.HTTPConnection(host, port)
                try:
                    conn.set_debuglevel(debuglevel)
                    conn.connect()
                    conn.request('POST', uri, request, headers)
                    try:
                        ret = XML2Py()(self.Tee(conn.getresponse(),
                                                tee=self.kw['debug']))
                        break
                    except parse_errors:
                        ret = {}
                finally:
                    # Release the socket whatever the outcome
                    conn.close()
            return ret

    def __init__(self, debug=False, proxy=None, retry= 3):
        """
        Initialize the console, setting the debug flag, http proxy,
        and number of retries in case of malformed answer.

        proxy is expected to be a string, of the usual http://host:port
        format, following the unix-style http_proxy environment variable.
        """
        self.debug = debug
        self.proxy = proxy
        self.retry = retry

    def __getattr__(self, port):
        """
        Call a SOAP port, as defined in Radio-Canada's spec:

        http://www1.radio-canada.ca/aspx/WSConsole/console.asmx?WSDL

        Return a SOAPQuery instance for the port; names starting with
        two underscores raise AttributeError instead.
        """
        # We do support calls to arbitrary ports, but we also
        # includes out-of-the-box values for a couple of them:
        # see self.queries.
        if port[:2] == '__':
            raise AttributeError('no attribute %s' % port)
        else:
            return self.SOAPQuery(port = port, debug = self.debug,
                                  proxy = self.proxy, retry = self.retry,
                                  **self.queries.get(port,{'fields':{}}))

    def GetList2Alt(self, **kw):
        """
        Special wrapper to GetList2 port: for the sake of uniformity,
        it is significantly easier to deal with some medias iterable
        (as with other ports such as GetListAujourdhui or
        GetListSuggere): this is what this method supplies, by calling
        the port by chunks of ten items, then spoonfeeding the result
        to the caller.

        Any NbrItem or StartItem keyword is ignored. A server error is
        reported on sys.stderr; nothing is yielded when the answer
        holds no NbResultats entry.
        """
        for k in ('NbrItem', 'StartItem'):
            if k in kw:
                del kw[k]

        r = self.GetList2(NbrItem=1, **kw)
        if 'Erreur' in r and r['Erreur'] is not None:
            print >> sys.stderr, 'server error:', r['Erreur']
        if 'NbResultats' not in r:
            return
        # StartItem is one-based: the upper bound has to be inclusive,
        # or the last item is lost whenever the count is 10*k + 1
        for i in range(1, r['NbResultats'] + 1, 10):
            r = self.GetList2(StartItem=i, NbrItem=10, **kw)
            medias = r['Medias']
            if medias is None:
                # Empty chunk
                continue
            if isinstance(medias, dict):
                # XML2Py collapses a chunk holding one media only into
                # the media itself
                medias = [medias]
            for media in medias:
                yield media
291 #-------------------------------------------------------------------------------
292 # Generic formatting
# Here is a couple of templates used by cli(): each one is a %-style
# format string, applied by apply_template() to every item of a server
# answer (wrapped in a Missing mapping, with the 'Pos' key holding the
# position of the item in the answer).
#
# NOTE(review): '%(...)d' fields need a number; an absent key yields the
# placeholder string from Missing instead, which presumably makes the
# formatting fail -- confirm the server always supplies IDEmission.

templates = {'verboseMedia' : '''CLip %(Pos)d (Media ID %(IDMedia)s)
Diffusion: %(HeureDiffusionStr)s %(DateDiffusionStr)s
Duration: %(Duree)s seconds
Broadcast: %(NomEmission)s (Broadcast ID %(IDEmission)s)
Description: [%(ExtraitTitre)s]
Network: %(NomReseau)s
Integral: %(IsIntegral)s
Video: %(Video)s
URI: %(LienASX)s
''' + '='*80,
             'linkMedia': '%(LienASX)s',
             'broadcasts': '%(Nom)s (ID %(IDEmission)d)'
             }
310 # ... And there is the templating "engine"
def apply_template(iterable, template=None, missing = '<%s: N/A>',
                   items=None, encoding='utf-8', stream=None):
    """
    Format the results of a Console query and write them on a
    file-like stream (sys.stdout when none is given).

    Every item yielded by iterable has to behave as a dictionary. With
    a template, the item receives its position under the 'Pos' key,
    then the template is %-formatted against it (absent keys being
    replaced by `missing`, see Missing) and encoded before output.
    Without template, items are pretty-printed under a separator line.
    When `items` is given, only the positions it contains are output.

    See cli() for example on how it can be used.
    """
    if stream is None:
        stream = sys.stdout

    for pos, record in enumerate(iterable):
        # Skip whatever the caller did not ask for
        if items is not None and pos not in items:
            continue
        if template is not None:
            record['Pos'] = pos
            rendered = template % Missing(record, missing)
            print >> stream, rendered.encode(encoding)
        else:
            print >> stream, ('--- Clip %d ' % pos) + '-'*60
            pprint.pprint(record, stream=stream)
334 #-------------------------------------------------------------------------------
335 # Now, specify a simple CLI interface
337 def cli():
339 Simple CLI interface covering the most useful/common cases (looks
340 at sys.argv to decide what to do).
343 # Generate and parse the command line
345 p = optparse.OptionParser(
346 usage="""%prog [options]
348 Radio-Canada's Video on demand non-interactive command line interface.""",
349 version='''%%prog %s
350 Copyright (C) 2007 Sylvain Fourmanoit <syfou@users.sourceforge.net>.
351 This is free software; see the source for copying conditions. There is NO
352 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
353 '''.strip() % __version__)
354 p.add_option('-b', '--broadcasts',
355 dest='list_broadcasts', action="store_true", default=False,
356 help='list all available broadcasts')
357 p.add_option('-s', '--suggested',
358 dest='list_suggested', action="store_true", default=False,
359 help='list suggested clips')
360 p.add_option('-t', '--today',
361 dest='list_today', action='store_true', default=False,
362 help='list today\'s clips')
363 p.add_option('-q', '--query',
364 dest='broadcast_id', default=None,
365 help='''query clips for a given broadcast (use --broadcasts
366 for a list of all possible broadcast IDs)''')
367 p.add_option('-k', '--search',
368 dest='keywords', default=None,
369 help='''query clips based on keyword search''')
370 p.add_option('-d', '--date-offset',
371 dest='days', default=7,
372 help='''specify how far in time to look back (default is 7,
373 i.e. looking one week back): this only applies to
374 --query and --search requests''')
375 p.add_option('-i', '--items',
376 dest='items', default=None,
377 help='''limit output to a list of comma-separated items,
378 based on their indexed position in the server
379 response (for instance, specifying "0, 2" will
380 make the script printout only the templated output
381 for the first and third item''')
382 p.add_option('--template',
383 dest='template', default=None,
384 help='''manually specify output template to replace default
385 (advanced usage: read the code if ever you need
386 this)''')
387 p.add_option('--encoding',
388 dest='encoding', default='utf-8',
389 help='specify output console encoding (default: utf-8)')
390 p.add_option('-r', '--raw',
391 dest='raw', action="store_true", default=False,
392 help='''Force brute, detailed output (ignore any template):
393 work for all requests''')
394 p.add_option('-a', '--asx',
395 dest='asx', action="store_true", default=False,
396 help='''Force output of ASX links only (ignore any template):
397 make sense for all but --broadcasts requests''')
398 p.add_option('-p', '--proxy',
399 dest='http_proxy', default=None,
400 help='''specify what http proxy to use, as a string of
401 the form "http://host:port". By default,
402 the content of environment variable http_proxy
403 is used, if set. If neither this option nor
404 the http_proxy variable is specified, the script
405 connects to Radio-Canada directly''')
406 p.add_option('--retry',
407 dest='retry', default=3, type="int",
408 help='''specify how many times to resend a request in case of
409 a malformed XML answer. Default is 3 times.''')
410 p.add_option('--debug',
411 dest='debug', action="store_true", default=False,
412 help='''Send complete trace of client<->server transactions
413 on stdout (data sent and data received,
414 including headers)''')
416 opts, args = p.parse_args()
418 # Initialize the console
420 if opts.http_proxy is None: opts.http_proxy = os.getenv('http_proxy')
421 console = Console(debug=opts.debug, proxy=opts.http_proxy, retry=opts.retry)
423 # Set the various parameters based on mode
425 # Default keywords and templates...
426 kw = {}
427 template = templates['verboseMedia']
429 # Then, make adjustments based on invokation
431 if opts.list_broadcasts:
432 template = templates['broadcasts']
433 port = console.GetEmissions
434 elif opts.list_suggested:
435 port = console.GetListSuggere
436 elif opts.list_today:
437 port = console.GetListAujourdhui
438 elif opts.broadcast_id is not None or opts.keywords is not None:
439 if opts.broadcast_id is not None: kw['IDEmission'] = opts.broadcast_id
440 if opts.keywords is not None: kw['MotsCle'] = opts.keywords
441 kw['DateOffset'] = opts.days
442 port = console.GetList2Alt
443 else:
444 p.error(' '.join(
445 ['no request specified (one of --broadcasts, --suggested,',
446 '--today, --query or --search), bailing out. See --help',
447 'for details.']))
449 # Make last minutes adjustments to the template
451 if opts.template is not None: template = opts.template
452 if opts.raw: template=None
453 if opts.asx: template=templates['linkMedia']
455 # And check for items output limitations
457 if opts.items is not None:
458 opts.items = [int(i) for i in opts.items.split(',')]
460 # Fire in the hole!
462 try:
463 apply_template(port(**kw), items=opts.items,
464 template=template, encoding=opts.encoding)
465 except RuntimeError, e:
466 p.error(str(e))
467 except:
468 print >> sys.stderr, 'An error occured while processing the request:'
469 raise
471 #-------------------------------------------------------------------------------
# Run the command line interface when executed as a script; importing
# the module only makes its definitions available.
if __name__ == '__main__':
    cli()