2 ###########################################################################
3 # Copyright (C) 2007 by Andrew Mahone
4 # <andrew.mahone@gmail.com>
6 # Copyright: See COPYING file that comes with this distribution
8 ###########################################################################
9 """Simple interface for retrieving comics"""
10 import time
, urllib
, urllib2
, random
, yaml
, re
, BeautifulSoup
, \
11 copy
, urlparse
, os
.path
, NameEnc
, sys
, mimetypes
, threading
, traceback
12 from signal
import SIG_IGN
, SIGINT
, signal
13 from Comic
import DB
, Conf
, ComicLoader
, Magic
, InitDB
14 from htmlentitydefs
import name2codepoint
15 from pkg_resources
import resource_stream
# Module-level lock; acquired non-blocking on KeyboardInterrupt in the main
# driver loop — apparently used as a cross-thread "halt requested" flag.
# NOTE(review): exact protocol depends on lines missing from this view.
HaltLock = threading.Lock()

# Type of a compiled regex object, for isinstance() checks (Python 2's re
# module exposed no public name for it).
REType = type(re.compile(''))
# NOTE(review): original lines 23 and 25 are missing from this view. `proxy`
# appears to be a closure inside a wrapper/decorator that supplies `func`,
# and the bare `raise` below is presumably guarded by a halt check on the
# missing line 25 — confirm against the full source.
def proxy(*args, **kw):
    # [original line 25 missing here]
    raise KeyboardInterrupt
    return func(*args, **kw)
def Merge(infile, theclass, thedict):
    """Load YAML comic definitions from *infile* (path or open file),
    instantiate *theclass* for each entry, and merge the results into
    *thedict* keyed by name."""
    if isinstance(infile, basestring):
        infile = open(infile, 'rb')
    # Uses the project-specific ComicLoader for yaml.load.
    newdefs = yaml.load(infile, Loader=ComicLoader)
    # NOTE(review): original line 36 (the tail of this comprehension, i.e.
    # the iterable and closing bracket) is missing from this view; the loaded
    # YAML is presumably a mapping of name -> kwargs — confirm.
    newdefs = [ (c[0], theclass(name=c[0],**c[1])) for c in \
    # [original line 36 missing here]
    thedict.update(newdefs)
def FetchURL(url, referer=None, outfile=None):
    """
    Fetches the requested url, with referer if desired. If outfile is
    specified, returns url info, otherwise returns (data, url info).
    Exceptions other than KeyboardInterrupt will be ignored silently until
    the specified number of attempts have been made to retrieve the url.
    Fetch will sleep for the specified delay before attempt, with the delay
    increasing exponentially.
    """
    # NOTE(review): many original lines (41, 48-49, 51-53, 56, 59-62, 64,
    # 67-68, 71-81, 83-86, 89-92, 94-95) are missing from this view; the
    # retry loop, delay initialisation, read loop and try/except structure
    # are only partially visible below.
    attempts = Conf.dl_attempts
    # [lines 51-53 missing: retry-loop header / initial delay]
    # Randomised, exponentially-growing politeness delay between attempts.
    time.sleep(delay * (1 + random.random() * Conf.dl_delay_rand))
    delay *= Conf.dl_delay_mul
    if isinstance(outfile, basestring):
        outfile = open(outfile, 'wb')
    # [lines 59-62 missing]
    req = urllib2.Request(url)
    # [line 64 missing: presumably `if referer:` guard]
    req.add_header('Referer',referer)
    handle = urllib2.urlopen(req)
    # [lines 67-68 missing: presumably a halt check guarding this raise]
    raise KeyboardInterrupt
    # Stream the response in 4 KiB chunks.
    dat = handle.read(4096)
    # [lines 71-81 missing: read loop / accumulation / byte count `n`, `i`]
    if i.has_key('Content-Length'):
        # [lines 83-86 missing]
        if n != int(i['Content-Length']):
            raise EOFError('Received incomplete file from server')
    # [lines 89-92 missing]
    except KeyboardInterrupt:
    # [lines 94-95 missing]
    # Out of retries: re-raise the last failure.
    if not attempts: raise
# Matches either an HTML entity -- numeric (&#NNN;) in group 2 or named
# (&name;) in group 3, whole entity body in group 1 -- or a single bare
# XML-special character ("'&<>) in group 4.
_EntityRE = re.compile('(?:&((?:#(\d+)|([A-Za-z]+));)|(["\'&<>]))')
def ParseEntities(input,OutputXML=True):
    """Rewrite HTML entities in *input*: XML-safe entities are kept/emitted
    as entities when OutputXML is true, others are decoded to characters.
    NOTE(review): _XMLEntities and _MSEntities are defined on lines missing
    from this view (original 99-144); several branch headers below are also
    missing, so the exact control flow cannot be confirmed here."""
    # [lines 146-147 missing: presumably `output = []`, `last = 0`]
    for m in re.finditer(_EntityRE, input):
        # [line 149 missing]
        # Copy the literal text between the previous match and this one.
        output.append(input[last:m.start()])
        # [lines 151-154 missing: numeric-entity branch, presumably
        #  `ch = int(m.group(2))`]
        if ch in _XMLEntities and OutputXML:
            output.append(_XMLEntities[ch])
        # [line 157 missing]
        elif ch in _MSEntities:
            # [lines 159-160 missing]
            output.append(unichr(ch))
        # [lines 162-164 missing: named-entity branch header]
        if m.group(3) in _XMLEntities and OutputXML:
            # NOTE(review): `ch` here is left over from the numeric branch;
            # `_XMLEntities[m.group(3)]` looks intended — confirm against
            # the full source before changing.
            output.append(_XMLEntities[ch])
        # [line 167 missing]
        if m.group(3) in name2codepoint:
            ch = name2codepoint[m.group(3)]
            # [lines 170-171 missing]
            output.append(unichr(ch))
        # [lines 173-176 missing]
        # Unknown entity: emit the pieces back out literally.
        output.append('&')
        # [line 178 missing]
        output.append(m.group(1))
        # [line 180 missing]
        output.append(m.group(0))
        # [lines 182-183 missing: bare-special-character branch]
        output.append(_XMLEntities[m.group(4)])
        # [line 185 missing]
        output.append(m.group(4))
    # Trailing literal text after the final match.
    output.append(input[last:])
    return ''.join(output)
# Splits an image URL on "comic/"/"comics/" (case-insensitive) to derive a
# default local file name.
_DefaultFileRE = re.compile('(?i)comics?/')
# Default `map` specs: pull the URL out of a link's href / an image's src.
_DefaultAnchorURLMap = ('attrs', 'href')
_DefaultImageURLMap = ('attrs', 'src')
# NOTE(review): the lines below are fragments of the _DefaultPrevious and
# _DefaultImage matcher definitions; their surrounding structure (original
# lines 193-195, 197-198, 200-203, 205-206, 208-211, 213-214, 216-221,
# 223-224, 226-228) is missing from this view.
    'contents': re.compile('(?i)previous'),
    'url': _DefaultAnchorURLMap
    'contents': re.compile('(?i)prev'),
    'url': _DefaultAnchorURLMap
    'contents': re.compile('(?i)back'),
    'url': _DefaultAnchorURLMap
    'src': re.compile('(?i)(?:^|/)comics?/')
    'url': _DefaultImageURLMap
def TextFullMatch(item, text):
    """Whole-value match of *text* against *item*: a plain string compares
    directly (its return statement is on a line missing from this view,
    presumably `return item == text`), a compiled regex matches via
    .search()."""
    if isinstance(item, basestring):
        # [original line 231 missing here]
    elif isinstance(item, REType):
        return not item.search(text) is None
    # [lines 234-236 missing]
def TextSubMatch(item, text):
    """Substring-style match: string -> `in text`; list/tuple -> True if any
    element is in text; otherwise *item* is assumed to be a compiled regex
    and its .search() result is returned."""
    if isinstance(item, basestring): return item in text
    if isinstance(item, (list, tuple)):
        # [line 240 missing: presumably `for i in item:`]
        if i in text: return True
        # [lines 242-243 missing: presumably `return False`]
    return item.search(text)
def ItemMatch(item, tag):
    """Return whether BeautifulSoup *tag* satisfies matcher dict *item*
    ('name', 'text', 'contents', 'attrs', 'next'/'prev' sibling matchers,
    'parent'). Several False-returning bodies sit on lines missing from
    this view."""
    if 'name' in item and not TextFullMatch(item['name'], tag.name):
        # [line 248 missing: presumably `return False`]
    if 'text' in item and not TextSubMatch(item['text'], tag):
        # [line 250 missing]
    if 'contents' in item and not TextSubMatch(item['contents'],
        tag.renderContents(encoding=None)):
        # [line 253 missing]
    if 'attrs' in item and item['attrs']:
        attrs = dict(tag.attrs)
        for key, value in item['attrs'].iteritems():
            # [lines 257-265 missing: presence / value checks]
            elif not TextFullMatch(value, attrs[key]):
                # [line 267 missing]
    # Sibling matchers: walk past non-Tag nodes to the first real sibling.
    for rel, attr in (('next', 'nextSibling'), ('prev', 'previousSibling')):
        if rel in item and item[rel]:
            # [line 270 missing: presumably a loop over siblings]
            next = getattr(tag,attr)
            if not next: return False
            if isinstance(next, BeautifulSoup.Tag): break
            if not ItemMatch(item[rel],next): return False
    if item.has_key('parent') and item['parent']:
        # [lines 276-277 missing]
        if not ItemMatch(item['parent'],tag.parent): return False
    # [lines 279-280 missing: presumably `return True`]
def DeepMerge(d1, d2):
    """Recursively merge *d2* into *d1*; nested dicts are merged rather than
    replaced. The non-dict branches and the final return are on lines
    missing from this view (presumably `return d1`)."""
    for k in set(d1.keys() + d2.keys()):
        # [lines 283-284 missing: presumably the `k in d1 and k in d2` case]
        if isinstance(d2[k], dict):
            d1[k] = DeepMerge(d1[k],d2[k])
        # [line 287 missing: presumably `elif k in d2:`]
        if isinstance(d2[k], dict):
            # Deep-copy d2's sub-dict by merging it into a fresh dict.
            d1[k] = DeepMerge(dict(), d2[k])
        # [lines 290-292 missing: plain-value assignment / `return d1`]
class ETuple(tuple):
    """A tuple that, unlike the builtin, can carry extra instance
    attributes (the matcher tuples stored on ComicDef get a ``.tags``
    attribute attached)."""
# NOTE(review): the enclosing class statement (apparently ComicClass, see
# the Merge(classes_source, ComicClass, Classes) call) is on a line missing
# from this view; this is its __init__.
def __init__(self,**kw):
    """Build a reusable comic 'class' from YAML-sourced keyword args;
    requires 'name', copies the four matcher groups when present."""
    # [line 297 missing: presumably `if 'name' not in kw:` guarding this]
    raise TypeError("'name' argument is required")
    self.name = kw['name']
    for n in ('archive', 'current', 'previous', 'image'):
        # [line 301 missing: presumably `if n in kw:`]
        setattr(self, n, kw[n])
# NOTE(review): the enclosing class statement (apparently ComicDef, see the
# Merge(definitions_source, ComicDef, Defs) call) is missing from this view;
# this is its __init__. Many guard/else lines are missing below and are
# marked individually.
def __init__(self,**kw):
    """Build one comic definition: resolve its parent class, merge matcher
    specs, fill in defaults, precompute tag sets, set up HTML massage rules,
    and bind the DB record and on-disk store directory."""
    # Per-thread DB connection (SQLObject-style threading model).
    DB.SetThreadConnection()
    # [line 309 missing: presumably `if 'name' not in kw:` guarding this]
    raise TypeError("'name' argument is required")
    self.name = kw['name']
    # [line 312 missing: presumably `if 'class' in kw:`]
    if kw['class'] in Classes:
        self.parentclass = Classes[kw['class']]
    # [line 315 missing: presumably `else:`]
    raise TypeError('comic inherits from undefined class "%s"' % kw['class'])
    # [line 317 missing: presumably `else:`]
    self.parentclass=None
    # URL anchors for the various fetch modes; default to ''.
    for n in ('url_start', 'url_current', 'url_archive', 'url_stop'):
        setattr(self, n, kw.get(n,''))
    # Merge each matcher group with the parent class's version.
    for n in ('archive', 'current', 'previous', 'image'):
        if self.parentclass and getattr(self.parentclass, n):
            classattr = getattr(self.parentclass, n)
            # [lines 324-326 missing]
            if isinstance(defattr,dict):
                # [lines 328-329 missing]
                myattr.append(DeepMerge(n, classattr))
                # [line 331 missing]
            myattr = (classattr),
            # [lines 333-334 missing]
        if isinstance(kw[n], dict):
            # [lines 336-340 missing]
        setattr(self,n, ETuple(myattr))
    # Fall back to the built-in defaults when nothing was specified.
    if not(self.previous or self.archive):
        self.previous = ETuple(_DefaultPrevious)
    # [line 344 missing: presumably `if not self.image:`]
    self.image = ETuple((_DefaultImage,))
    # Anchor-type matchers default to <a> tags with an href map...
    for i in self.previous + self.current + self.archive:
        if 'name' not in i: i['name'] = 'a'
        if 'map' not in i: i['map'] = {'url': _DefaultAnchorURLMap}
        if 'url' not in i['map']: i['map']['url'] = _DefaultAnchorURLMap
    # ...image matchers default to <img> tags with a src map.
    # [line 350 missing: presumably `for i in self.image:`]
    if 'name' not in i: i['name'] = 'img'
    if 'map' not in i: i['map'] = {'url': _DefaultImageURLMap}
    if 'url' not in i['map']: i['map']['url'] = _DefaultImageURLMap
    # Precompute the tag-name sets each matcher group can hit.
    for match in ('image', 'current', 'archive', 'previous'):
        getattr(self, match).tags = self.GetTags(getattr(self, match))
    # previous/image are searched on the same page: share the union.
    if self.previous and self.image:
        if self.previous.tags and self.image.tags:
            tags = self.previous.tags | self.image.tags
            # [lines 359-360 missing]
            self.previous.tags = tags
            self.image.tags = tags
    # [lines 363-364 missing: presumably `if 'massage' in kw:`]
    self.Massage = copy.copy(BeautifulSoup.BeautifulSoup.MARKUP_MASSAGE)
    # NOTE(review): `lambda m: m.expand(y)` is a late-binding closure — every
    # lambda sees the FINAL y of the comprehension. Likely a real bug; fix
    # with a default arg (lambda m, y=y: ...) once the full source is in hand.
    self.Massage.extend([ (x, lambda m: m.expand(y)) for x,y in kw['massage'] ])
    # [line 367 missing: presumably `else:`]
    self.Massage = BeautifulSoup.BeautifulSoup.MARKUP_MASSAGE
    # [line 369 missing: presumably `try:`]
    self.DBComic = DB.Comic.byName(self.name)
    # [lines 371-372 missing]
    self.CurrentTitle = ''
    # Directory name is an encoded form of the comic name.
    self.DirName = NameEnc.Encode(self.name)
    self.StoreDir = os.path.join(Conf.comic_store, self.DirName)
def Load(self, url, tags=()):
    """Fetch *url* (with the previous page as referer), remember it as the
    working URL, and parse the body into self.HTMLTree with BeautifulSoup,
    honouring any charset declared in the Content-Type header."""
    (dat, inf) = FetchURL(url, referer=self.WorkingURL)
    self.WorkingURL = url
    # [lines 381-383 missing: presumably `kw = {}` and parseOnlyThese setup
    #  using `tags`]
    kw['markupMassage'] = self.Massage
    if inf.has_key('Content-Type'):
        m = re.search("charset=(.+?)(?:[; ]|$)", inf['Content-Type'])
        # [line 387 missing: presumably `if m:`]
        kw['fromEncoding'] = m.group(1)
    self.HTMLTree = BeautifulSoup.BeautifulSoup(dat, **kw)
    # [lines 390-391 missing]
def FetchFile(self, url, outfile):
    """Download *url* into *outfile*, sending the page currently being
    parsed (self.WorkingURL) as the HTTP Referer; returns the url info
    from FetchURL."""
    current_page = self.WorkingURL
    return FetchURL(url, outfile=outfile, referer=current_page)
def CreateDB(self, clear=0):
    """Create (and bind) the DB.Comic row for this comic inside a
    transaction. NOTE(review): lines 396 and 398 are missing from this
    view; the `clear` parameter's handling is not visible."""
    # [line 396 missing]
    self.DBComic = DB.DoInTransaction(DB.Comic, name=self.name)
    # [line 398 missing]
def MapItem(self, tag, imap):
    """Extract a dict of values from BeautifulSoup *tag* according to the
    map spec *imap*: each entry walks the tree (sibling/parent/child steps),
    pulls an attribute / text / contents, then applies post-processing
    (fragment stripping, URL joining, unquoting, regex expand/replace,
    whitespace normalisation). Many branch bodies are on lines missing
    from this view."""
    # [line 400 missing: presumably result-dict initialisation]
    for k, vm in imap.iteritems():
        # [lines 402-403 missing: presumably `mytag = tag` and a copy of vm]
        # Walk tree-navigation prefixes off the front of the map spec.
        while vm and vm[0] in ('nextSibling','previousSibling','parent','next','previous','contentsindex','childbyname'):
            if vm[0] == 'contentsindex':
                mytag = mytag.contents[vm[1]]
                # [line 407 missing: presumably vm advance]
            elif vm[0] == 'childbyname':
                mytag = mytag.find(vm[1])
                # [lines 410-411 missing]
            mytag = getattr(mytag,vm[0])
            # [lines 413-414 missing: presumably the 'attrs' accessor branch]
        if mytag.has_key(vm[1]):
            # Attribute values are entity-decoded on the way out.
            v = ParseEntities(mytag[vm[1]])
            # [lines 417-420 missing]
        elif vm[0] == 'text':
            # [lines 422-423 missing]
        elif vm[0] == 'contents':
            v = mytag.renderContents(encoding=None)
            # [lines 426-428 missing: presumably a 'url' post-processing step]
        # Strip any #fragment, then resolve relative to the current page.
        v = re.subn('#.*$', '', v)[0]
        # [line 430 missing]
        v = urlparse.urljoin(self.WorkingURL, v)
        # [line 432 missing]
        elif vm[0] == 'urlunquote':
            v = urllib.unquote(v)
            # [line 435 missing]
        elif isinstance(vm[0], REType):
            # [lines 437-444 missing: regex match handling]
        elif vm[1] == 'expand':
            # [lines 446-451 missing]
        elif vm[1] == 'replace':
            v = vm[0].sub(vm[2],v)
            # [lines 454-458 missing]
        # Collapse runs of whitespace to single spaces.
        v = ' '.join(v.strip().split())
        # [line 460 missing]
        v = re.subn('#.*$', '', v)[0]
        # [line 462 missing]
        v = urlparse.urljoin(self.WorkingURL, v)
        # [lines 464-467 missing: presumably storing v and returning the dict]
def GetTags(self, item):
    """Compute the set of HTML tag names a matcher (or tuple of matchers)
    can possibly hit, recursing through prev/next/parent sub-matchers;
    returns None-ish for regex matchers (no tag filter possible). Several
    lines are missing from this view."""
    if isinstance(item, (tuple, list)):
        # [lines 470-477 missing: presumably recurse per element, union]
    # A raw-regex matcher or one without a tag name can't be tag-filtered.
    if 're' in item or not 'name' in item:
        # [line 479 missing: presumably `return None`]
    tags = [item['name']]
    for rel in ('prev', 'next', 'parent'):
        if rel in item and item[rel]:
            t = self.GetTags(item[rel])
            # [lines 484-488 missing: merge t into tags / return]
def FindItems(self, item):
    """Find all matches for matcher *item* in the parsed page: raw regex
    matchers run over the HTML text, dict matchers run ItemMatch over the
    tree; apply 'index'/'reverse' selection and the 'map' extraction.
    Branch structure is partly missing from this view."""
    if isinstance(item, (tuple, list)):
        # [line 491 missing: presumably `for i in item:`]
        ret = self.FindItems(i)
        # [lines 493-495 missing: accumulate / return]
    # [guard line missing: presumably `if 're' in item:`]
    ret = re.findall(item['re'], unicode(self.HTMLData))
    # [line 497 missing: presumably `else:`]
    ret = self.HTMLTree.findAll(lambda x: ItemMatch(item, x))
    # [lines 499-500 missing: presumably `if 'index' in item:`]
    if item['index'] == 'first':
        # [line 502 missing]
    elif item['index'] == 'last':
        # [line 504 missing]
    elif not item['index'] is None:
        ind = int(item['index'])
        # [line 507's guard context unclear]
    elif item.has_key('reverse'):
        # [lines 508-513 missing]
    # Convert each matched tag into a value dict via its map spec.
    ret = [ self.MapItem(i, item['map']) for i in ret ]
    # [lines 515-516 missing: presumably `return ret`]
def FindItem(self, item):
    """Return a single match for *item* — apparently the first non-empty
    FindItems result. NOTE(review): lines 518 and 520-523 are missing from
    this view (including the loop header that binds `i` and the return)."""
    # [line 518 missing: presumably `for i in item:`]
    ret = self.FindItems(i)
    # [lines 520-523 missing]
def FindImages(self):
    """Locate all strip images on the current page and ensure each has a
    'file' name: derived from the path after 'comic(s)/' when present,
    otherwise the URL's last path component."""
    ret = self.FindItems(self.image)
    # [line 526 missing: presumably `for img in ret:`]
    if not 'file' in img:
        l = _DefaultFileRE.split(img['url'])
        # [line 529 missing: presumably `if len(l) > 1:`]
        # Keep the path after comics/, flattening subdirs into '_'.
        img['file'] = l[1].replace('/','_')
        # [line 531 missing: presumably `else:`]
        img['file'] = img['url'].rsplit('/',1)[-1]
    # [lines 533-535 missing: presumably `return ret`]
def FetchComic(self, mode='backlog', exists=''):
    """Fetch the strip(s) on the current page: build an issue item from
    FindImages, reuse/refresh existing DB rows, download each image into
    self.StoreDir (skipping existing files unless exists=='refetch'),
    determine its MIME type, and clean up files on failure. Large parts of
    the control flow are on lines missing from this view."""
    # [lines 537-538 missing: presumably `item = dict(` with url/title]
    images=self.FindImages(),
    # [line 540 missing]
    if self.CurrentTitle:
        item['title'] = self.CurrentTitle
    # [line 543 missing]
    for image in item['images']:
        # Look for an existing DB row for this image.
        sel = DB.Image.selectBy(
            comicID=self.DBComic.id,
        # [lines 547-550 missing: rest of selectBy / `prevdb` binding]
        prev = dict([ (x, getattr(prevdb,x)) for x in
            ['title', 'extra', 'url', 'file', 'mime']])
        # [lines 553-556 missing]
        if mode == 'backlog':
            # [line 558 missing: presumably `def fun():`]
            prevdb.issue.url = self.WorkingURL
            DB.DoInTransaction(fun)
        # [line 561 missing]
        filename = os.path.join(self.StoreDir, image['file'])
        # [line 563 missing]
        if not os.path.exists(filename) or exists == 'refetch':
            headers = self.FetchFile(image['url'], filename)
            # MIME type: server header, then extension guess, then magic.
            if headers.has_key('Content-Type'):
                m = headers['Content-Type']
            # [line 568 missing: presumably `else:` / fallback guard]
                m = mimetypes.guess_type(filename)[0]
            # [line 570 missing]
                m = Magic.Path(filename)
            # [lines 572-575 missing: storing mime / except handler?]
    # Failure path (context missing): remove any files already downloaded.
    for image in item['images']:
        filename = os.path.join(self.StoreDir, image['file'])
        if os.path.exists(filename):
            try: os.remove(filename)
            # [lines 580-585 missing]
# NOTE(review): the `def` header for this method (apparently GetPrev — it is
# called as self.GetPrev() elsewhere and should return truthy while there is
# an earlier strip to visit) is on a line missing from this view; the lines
# below are fragments of its body. It finds the 'previous' link, stops on
# known terminal URLs / already-seen issues / url_stop, and otherwise loads
# the previous page.
prev = self.FindItem(self.previous)
url = prev.get('url','')
# [lines 588-589 missing]
if url in [ getattr(self,x) for x in \
    ('url_current', 'url_start', 'url_archive', \
# [lines 592-593 missing: tail of the list / return]
if DB.Issue.selectBy(
    comicID=self.DBComic.id,
# [lines 596-598 missing]
if url == self.url_stop:
    # [line 600 missing: presumably `return False`]
self.Load(url, self.image.tags)
# [lines 602-603 missing: presumably `return True`]
def FetchComics(self, mode='backlog', exists=''):
    """Top-level fetch driver for one comic in 'archive', 'resume' or
    'backlog' mode: position on the right starting page (archive list,
    url_start/url_current, or the DB's newest/oldest issue), walk strips
    via GetPrev/FetchComic, and commit results to the DB — new backlog
    issues are committed in one batch at the end with SIGINT ignored.
    Returns None on success or a ("WARNING"/"ERROR", message, exc-info)
    tuple. Much of the try/except scaffolding is on lines missing from
    this view."""
    # [line 605 missing: presumably `try:`]
    # An empty DB forces a full archive fetch.
    if not self.DBComic.issues.count():
        # [lines 607-608 missing: presumably `mode = 'archive'`]
    # [guard for url_archive missing]
    self.Load(self.url_archive, self.archive.tags)
    archive_comics = self.FindItems(self.archive)
    # [line 611 missing: presumably `if mode == 'resume':`]
    url = self.DBComic.issues.limit(1)[0].url
    elif mode == 'backlog':
        url = self.DBComic.issues.reversed().limit(1)[0].url
    # [lines 615-617 missing]
    # Trim the archive list to only the strips not yet in the DB.
    for n in range(len(archive_comics)):
        if archive_comics[n]['url'] == url:
            # [line 620 missing]
            archive_comics = archive_comics[n+1:]
            # [line 622 missing]
            archive_comics = archive_comics[:n]
            # [lines 624-626 missing]
    # Resume without an archive index: step back from the oldest issue.
    self.Load(self.DBComic.issues.limit(1)[0].url, self.previous.tags)
    if not self.GetPrev(): return
    # [lines 629-630 missing]
    self.Load(self.url_start,self.current.tags)
    cur = self.FindItem(self.current)
    url = cur.get('url','')
    if url and url not in [ getattr(self,x) for x in \
        ('url_current', 'url_start', 'url_archive', \
    # [line 636 missing: tail of the list]
    self.Load(url, self.previous.tags)
    self.CurrentTitle = cur.get('title','')
    # [lines 639-640 missing]
    self.Load(self.url_current,self.previous.tags)
    if not os.path.isdir(self.StoreDir):
        os.makedirs(self.StoreDir)
    # Archive mode restarts the comic's DB state from scratch.
    if mode == 'archive':
        # [line 645 missing: presumably `def fun():`]
        self.DBComic.UpdateTimeStamp()
        DB.Image.deleteBy(comicID=self.DBComic.id)
        DB.Issue.deleteBy(comicID=self.DBComic.id)
        DB.DoInTransaction(fun)
    # [lines 650-651 missing: presumably `except ...:` with e = exc_info]
    return ("WARNING", "exception raised (%s) while initializing, " \
        "suggest repeating operation %s later or after resolving " \
        "the problem" % (e[1].__class__.__name__, mode), e)
    # [line 655 missing: presumably `try:`]
    if mode == 'backlog':
        # [lines 657-658 missing: presumably `new_comics = []`]
    # Archive-index-driven walk.
    for comic in archive_comics:
        self.Load(comic['url'], self.image.tags)
        self.CurrentTitle = comic.get('title','')
        iss = self.FetchComic(mode, exists)
        # [lines 663-664 missing]
        if mode == 'backlog':
            new_comics.append(iss)
            # [line 667 missing: presumably `else:`]
            DB.DoInTransaction(self.DBFromItem, iss, 'old')
        # [lines 669-670 missing: presumably the previous-link walk loop]
        iss = self.FetchComic(mode, exists)
        # [lines 672-673 missing]
        if mode == 'backlog':
            new_comics.append(iss)
            # [line 676 missing: presumably `else:`]
            DB.DoInTransaction(self.DBFromItem, iss, 'old')
        if not self.GetPrev(): break
    # [lines 679-680 missing: presumably `except ...:` with e = exc_info]
    if isinstance(e[1], KeyboardInterrupt):
        msg = "keyboard interrupt received"
    # [line 683 missing: presumably `else:`]
        msg = "exception raised (%s)" % e[1].__class__.__name__
    if mode == 'backlog':
        return ("WARNING", "%s, suggest repeating operation backlog" \
        # [lines 687-688 missing: format tail / `else:`]
        return ("WARNING", "%s, suggest repeating operation resume" \
        # [line 690 missing]
    if mode == 'backlog':
        # [lines 692-693 missing]
        # Commit the whole batch atomically; shield it from Ctrl-C.
        oldhandler = signal(SIGINT, SIG_IGN)
        # [line 695 missing: presumably `try:`]
        for iss in reversed(new_comics):
            DB.DoInTransaction(self.DBFromItem, iss)
        # [lines 698-700 missing: handler restore / except]
        return("ERROR", "exception raised (%s) during DB update, " \
            "database may be inconsistent" % e[1].__class__.__name__, \
            # [lines 703-705 missing]
def DBFromItem(self, item, ext='new'):
    """Persist a fetched issue dict to the DB: pick the next serial (one
    past the newest for 'new', one before the oldest otherwise), promote a
    single image's title to the issue, then create the Issue row and one
    Image row per image. Constructor calls are on lines missing from this
    view."""
    if self.DBComic.issues.count():
        # [line 708 missing: presumably `if ext == 'new':`]
        serial = self.DBComic.issues.reversed().limit(1)[0].serial + 1
        # [line 710 missing: presumably `else:`]
        serial = self.DBComic.issues.limit(1)[0].serial - 1
    # [lines 712-713 missing: presumably first-issue serial]
    # Single image whose title duplicates (or supplies) the issue title:
    # move it up to the issue and blank it on the image.
    if len(item['images']) == 1 and 'title' in item['images'][0] and \
        (item['images'][0]['title'] == item.get('title') \
        or not item.get('title')):
        item['title'] = item['images'][0]['title']
        item['images'][0]['title'] = ''
    # [lines 719-720 missing: presumably `iss = DB.Issue(` ...]
    title=item.get('title',''),
    # [lines 722-725 missing]
    for i in item['images']:
        # [lines 727-730 missing: presumably `DB.Image(` ...]
        title=i.get('title',''),
        extra=i.get('extra',''),
        # [lines 733-739 missing]
# Load the comic class and comic definition YAML files bundled with the
# package and merge them into the global Classes / Defs registries.
classes_source = resource_stream('Comic', 'data/comics/classes.yml')
definitions_source = resource_stream('Comic', 'data/comics/definitions.yml')
Merge(classes_source, ComicClass, Classes)
Merge(definitions_source, ComicDef, Defs)
746 """Usage: %s [[operation [comics]] | [[comic_options] [comics]]]
748 -h, --help Display this usage message
749 -l, --list Lists user's selected comics.
750 -L, --list-all Lists all available comics.
751 -A, --add Add comics to user's selected comics.
752 -R, --remove Remove comics from user's selected comics.
754 If none of the above operations are specified, program will run in fetch
758 -a, --archive The following comics will be fetched in archive mode.
759 -r, --resume The following comics will be fetched in resume mode.
760 -b, --backlog The following comics will be fetched in backlog mode.
761 -f, --refetch Files existing in the comic archive will be re-fetched.
762 -F, --no-refetch Files existing in the comic archive will not be
765 Archive mode will fetch from the current strip back to the earliest available
766 strip. If an archive exists for the comic, its comic strip list will be replaced
767 with the list of new comics downloaded.
769 Resume mode will fetch from the oldest strip in the archive's comic strip list
770 back to the earliest available strip. The new strips will be appended to the
773 Backlog mode will fetch from the current strip back to the newest strip already
774 in the archive's comic strip list. The new strips will be prepended to the
775 existing list upon success.
777 If no comics are specified, all comics selected by the user will be fetched in
778 the specified mode."""
def FetchThreaded(comic, mode, exists):
    """Thread entry point for one comic: bind a per-thread DB connection,
    run the fetch, and record any non-None result tuple in the shared
    `errs` list. NOTE(review): lines 783-784 are missing — presumably an
    `if ret:` guard and/or locking around `errs`."""
    DB.SetThreadConnection()
    ret = comic.FetchComics(mode=mode, exists=exists)
    # [lines 783-784 missing]
    errs.append((comic,)+ret)
# NOTE(review): the enclosing `def` header (original line 789) is missing
# from this view; the `global` statement below shows these lines are a
# function body — the command-line driver: handles --help/--list/--add/
# --remove, builds a fetch list from the remaining mode/comic arguments,
# then runs fetches either threaded or serially, printing collected errors.
# Many guard/else/try lines are missing and are marked below.
global comlock, errs, exitsignal
# [lines 791-792 missing]
from Comic import Conf
comlock = threading.Lock()
exitsignal = threading.Event()
# [lines 796-801 missing]
AppName = os.path.basename(sys.argv[0])
if '-h' in sys.argv or '--help' in sys.argv[1:]:
    print usage % sys.argv[0]
    # [line 805 missing: presumably sys.exit()]
if len(sys.argv) > 1:
    if sys.argv[1] in ('--list','-l','--list-all','-L'):
        if len(sys.argv) > 2:
            sys.exit("--list and --list-all are not valid with other options. Run \"%s --help\" for usage."%(sys.argv[0],))
        if sys.argv[1] in ('--list','-l'):
            # Case-insensitive sort of the user's selected comics.
            comics = sorted(map(lambda x: x.name,
                DB.Comic.select()), lambda x,y:
                cmp(x.lower(), y.lower()))
            # [line 814 missing: presumably `if not comics:`]
            sys.stderr.write("No comics in user comic list.\n")
            # [lines 816-822 missing: --list-all branch / printing]
    elif sys.argv[1] in ('-A','--add','-R','--remove'):
        # [line 824 missing: presumably comiclist setup]
        if sys.argv[1] in ('-A','--add'):
            for c in sys.argv[2:]:
                # [line 827 missing: presumably `if c not in Defs:`]
                sys.exit("The comic \"%s\" does not exist in the set of available comic definitions."%c)
                elif not c in comiclist:
                # [lines 830-833 missing: add to DB / removal branch header]
        for c in sys.argv[2:]:
            # [line 835 missing]
            sys.exit("The comic \"%s\" does not exist in the set of available comic definitions."%c)
            elif not c in comiclist:
            # [lines 838-839 missing]
            DB.Comic.deleteBy(name=c)
# [lines 841-842 missing: fetch-mode defaults, fetchlist init]
for arg in sys.argv[1:]:
    if arg in ('--resume','-r'):
        # [line 845 missing: presumably `mode = 'resume'`]
    elif arg in ('--archive','-a'):
        # [line 847 missing]
    elif arg in ('--backlog','-b'):
        # [line 849 missing]
    elif arg in ('--refetch','-f'):
        # [line 851 missing]
    elif arg in ('--no-refetch','-F'):
        # [lines 853-855 missing: presumably `elif arg in Defs:`]
    if Defs[arg].DBComic:
        fetchlist.append((Defs[arg],mode,exists))
    # [line 858 missing: presumably `else:`]
    sys.exit("The comic \"%s\" is not selected for fetching. Try \"%s -A '%s'\" to add it."%(arg,AppName,arg))
    # [line 860 missing: presumably `else:`]
    sys.exit("The comic \"%s\". does not exist. Try \"%s -L\" to list available comics."%(arg,AppName))
# [line 862 missing: presumably `if not fetchlist:` — fetch everything]
s = DB.Comic.select()
# [line 864 missing]
fetchlist = map(lambda x: (Defs[x.name],mode,exists), s)
# [line 866 missing]
sys.exit("No comics specified and none in user's set of selected comics. Try \"%s -h\" to see more options."%AppName)
# [lines 868-871 missing: threaded-vs-serial decision, errs init, try:]
# Threaded path: cap concurrent workers at Conf.threads.
while len(threading.enumerate()) > Conf.threads:
    # [lines 873-874 missing: presumably a sleep/wait]
t = threading.Thread(target=FetchThreaded, name=i[0].name, args=i)
# [line 876 missing: presumably t.start()]
except KeyboardInterrupt:
    # Signal halt to workers and wait for them to drain.
    HaltLock.acquire(False)
    while len(threading.enumerate()) > 1:
        # [line 880 missing]
    while len(threading.enumerate()) > 1:
        # [lines 882-883 missing]
except KeyboardInterrupt:
    HaltLock.acquire(False)
# [lines 886-887 missing: serial path loop header `for c in fetchlist:`]
ret = c[0].FetchComics(c[1],c[2])
# [line 889 missing: presumably `if ret:`]
errs.append((c[0],) + ret)
# [line 891 missing: presumably `for e in errs:`]
print "%s: (%s) %s" % (e[1],e[0].name,e[2])
traceback.print_exception(e[3][0],e[3][1],e[3][2])