catch None urls in another place
[objavi2.git] / objavi / twiki_wrapper.py
blob6f6ceeee919fbed8a7438f5731a1d60c8b552194
1 """Fetch stuff from remote twiki instances"""
3 import os, sys, time, re
4 import tempfile
6 from objavi import config
7 from objavi.book_utils import log, guess_lang, guess_text_dir, make_book_name, decode_html_entities
8 from urllib2 import urlopen
9 from urlparse import urlsplit
10 from booki.bookizip import add_metadata, BookiZip
12 from objavi.xhtml_utils import BaseChapter, ImageCache
14 #from pprint import pformat
16 import lxml.html
17 from lxml import etree
# Minimal standalone-HTML wrapper used when a chapter's raw fragment needs to
# be parsed as a full document.  Substitution keys: %(dir)s is the text
# direction ("ltr"/"rtl"), %(title)s the page title, %(text)s the body HTML.
CHAPTER_TEMPLATE = '''<html dir="%(dir)s">
<head>
<title>%(title)s</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
%(text)s
</body>
</html>
'''
def get_book_list(server):
    """Ask the server for a list of books. Floss Manual TWikis keep such a list at
    /bin/view/TWiki/WebLeftBarWebsList?skin=text but it needs a bit of processing

    If BOOK_LIST_CACHE is non-zero, the book list won't be re-fetched
    in that many seconds, rather it will be read from disk.
    """
    if config.BOOK_LIST_CACHE:
        cache_name = os.path.join(config.CACHE_DIR, '%s.booklist' % server)
        if (os.path.exists(cache_name) and
            os.stat(cache_name).st_mtime + config.BOOK_LIST_CACHE > time.time()):
            # Cache file is fresh enough: serve the whitespace-separated
            # book names straight from disk.
            with open(cache_name) as f:
                return f.read().split()

    url = config.CHAPTER_URL % (server, 'TWiki', 'WebLeftBarWebsList')
    #XXX should use lxml
    log('getting booklist: %s' % url)
    f = urlopen(url)
    try:
        s = f.read()
    finally:
        # close the connection even if read() fails (original leaked it)
        f.close()
    # Book names are the path components of .../WebHome links; drop known
    # non-book webs.
    items = sorted(x for x in re.findall(r'/bin/view/([\w/]+)/WebHome', s)
                   if x not in config.IGNORABLE_TWIKI_BOOKS)
    if config.BOOK_LIST_CACHE:
        with open(cache_name, 'w') as f:
            f.write('\n'.join(items))
    return items
def toc_iterator(server, book):
    """TOC.txt has 3 lines per chapter. Fetch them and yield them in
    triples (as TocItem objects).

    The remote file is closed in a finally block, so it is released even
    when the generator is abandoned before exhaustion (the original only
    closed it after a clean StopIteration).
    """
    url = config.TOC_URL % (server, book)
    log('getting TOC: %s' % url)
    f = urlopen(url)
    try:
        encoding = config.SERVER_DEFAULTS[server]['toc-encoding']
        while True:
            try:
                # one chapter == three consecutive lines
                triple = [f.next(), f.next(), f.next()]
            except StopIteration:
                break
            if encoding is not None:
                # re-encode from the server's declared encoding to utf-8
                triple = [x.decode(encoding).strip().encode('utf-8')
                          for x in triple]
            else:
                triple = [x.strip() for x in triple]
            yield TocItem(*triple)
    finally:
        f.close()
class TocItem(object):
    """This makes sense of the tuples from TOC.txt files"""
    def __init__(self, status, chapter, title):
        # status is
        #  0 - section heading with no chapter
        #  1 - chapter heading
        #  2 - book title
        #
        # chapter is twiki name of the chapter
        # title is a human readable name of the chapter.
        self.status = decode_html_entities(status)
        self.chapter = decode_html_entities(chapter)
        self.title = decode_html_entities(title)

    def is_chapter(self):
        return self.status == '1'

    def is_section(self):
        return self.status == '0'

    def is_title(self):
        return self.status == '2'

    def __str__(self):
        return '<toc: %s>' % ', '.join('%s: %s' % x for x in self.__dict__.iteritems())

    def as_zipitem(self):
        """Convert to the dict form used in a bookizip TOC."""
        # NOTE: the closing brace of this literal was missing in the
        # corrupted source; reconstructed here.
        item = {
            "title": self.title,
            "url": self.chapter + '.html',
            'type': 'chapter',
        }
        if self.is_section():
            item["url"] = None
            item['type'] = 'booki-section'
            item['children'] = []
        return item
127 class TWikiBook(object):
128 credits = None
129 metadata = None
130 def __init__(self, book, server, bookname=None):
131 if bookname is None:
132 bookname = make_book_name(book, server, '.zip')
133 log("*** Extracting TWiki book %s ***" % bookname)
134 self.bookname = bookname
135 self.book = book
136 self.server = server
137 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=config.TMPDIR)
138 os.chmod(self.workdir, 0755)
139 #probable text direction
140 self.dir = guess_text_dir(self.server, self.book)
142 def filepath(self, fn):
143 return os.path.join(self.workdir, fn)
145 def _fetch_metadata(self, force=False):
146 """Get information about a twiki book (as much as is easy and
147 useful). If force is False (default) then it will not be
148 reloaded if it has already been set.
150 if self.metadata is not None and not force:
151 log("not reloading metadata")
152 return
153 meta = {
154 config.DC: {
155 "publisher": {
156 "": ["FLOSS Manuals http://flossmanuals.net"]
158 'identifier': {
159 "": ['http://%s/epub/%s/%s' %
160 (self.server, self.book, time.strftime('%Y.%m.%d-%H.%M.%S'))]
162 'creator': {
163 "": ['The Contributors']
165 'date': {
166 "": [time.strftime('%Y-%m-%d')]
168 'title': {
169 "": [self.book]
172 config.FM: {
173 'server': {"": [self.server]},
174 'book': {"": [self.book]},
178 lang = guess_lang(self.server, self.book)
179 self.dir = guess_text_dir(self.server, self.book)
180 #log(self.server, self.book, lang, self.dir)
181 if lang is not None:
182 add_metadata(meta, 'language', lang)
183 if self.dir is not None:
184 add_metadata(meta, 'dir', self.dir, ns=config.FM)
186 spine = []
187 toc = []
188 section = toc
189 waiting_for_url = []
191 for t in toc_iterator(self.server, self.book):
192 #log(t)
193 if t.is_title():
194 meta[config.DC]['title'][''] = [t.title]
195 continue
197 item = t.as_zipitem()
198 if item['url'] is None:
199 waiting_for_url.append(item)
200 elif waiting_for_url:
201 for wt in waiting_for_url:
202 wt['url'] = item['url']
203 waiting_for_url = []
205 if t.is_chapter():
206 spine.append(t.chapter)
207 section.append(item)
209 elif t.is_section():
210 section = item['children']
211 toc.append(item)
213 self.metadata = {
214 'version': 1,
215 'metadata': meta,
216 'TOC': toc,
217 'spine': spine,
218 'manifest': {},
221 self._parse_credits()
222 for c in self.contributors:
223 add_metadata(meta, 'contributor', c)
227 def make_bookizip(self, filename=None, use_cache=False):
228 """Extract all chapters, images, and metadata, and zip it all
229 up for conversion to epub.
231 If cache is true, images that have been fetched on previous
232 runs will be reused.
234 self._fetch_metadata()
235 if filename is None:
236 filename = self.filepath(self.bookname)
237 bz = BookiZip(filename, self.metadata)
239 all_images = set()
240 for chapter in self.metadata['spine']:
241 contents = self.get_chapter_html(chapter, wrapped=True)
242 c = TWikiChapter(self.server, self.book, chapter, contents,
243 use_cache=use_cache)
244 images = c.localise_links()
245 c.fix_bad_structure()
246 all_images.update(images)
247 #log(chapter, self.credits)
248 bz.add_to_package(chapter, chapter + '.html',
249 c.as_html(), **self.credits.get(chapter, {}))
251 # Add images afterwards, to sift out duplicates
252 for image in all_images:
253 imgdata = c.image_cache.read_local_url(image)
254 bz.add_to_package(image, image, imgdata) #XXX img ownership: where is it?
256 bz.finish()
257 return bz.filename
259 def get_chapter_html(self, chapter, wrapped=False):
260 url = config.CHAPTER_URL % (self.server, self.book, chapter)
261 log('getting chapter: %s' % url)
262 f = urlopen(url)
263 html = f.read()
264 f.close()
265 if wrapped:
266 html = CHAPTER_TEMPLATE % {
267 'title': '%s: %s' % (self.book, chapter),
268 'text': html,
269 'dir': self.dir
271 return html
273 def _parse_credits(self, force=False):
274 # open the Credits chapter that has a list of authors for each chapter.
275 # each chapter is listed thus (linebreaks added):
276 # <i>CHAPTER TITLE</i><br/>&copy; First Author 2007<br/>
277 # Modifications:<br/>Second Author 2007, 2008<br/>
278 # Third Author 2008<br/>Fourth Author 2008<br/><hr/>
280 # where "CHAPTER TITLE" is as appears in TOC.txt, and "X
281 # Author" are the names TWiki has for authors. So the thing
282 # to do is look for the <i> tags and match them to the toc.
284 # the chapter title is not guaranteed unique (but usually is).
285 if self.credits is not None and not force:
286 log("not reloading metadata")
287 return
289 self.credits = {}
290 self.contributors = set()
291 self.titles = []
293 credits_html = self.get_chapter_html('Credits', wrapped=True)
294 try:
295 parser = lxml.html.HTMLParser(encoding='utf-8')
296 tree = lxml.html.document_fromstring(credits_html, parser=parser)
297 except UnicodeDecodeError, e:
298 log("book isn't unicode! (%s)" %(e,))
299 encoding = config.SERVER_DEFAULTS[self.server]['toc-encoding']
300 parser = lxml.html.HTMLParser(encoding=encoding)
301 tree = lxml.html.document_fromstring(credits_html, parser=parser)
303 name_re = re.compile(r'^\s*(.+?) ((?:\d{4},? ?)+)$')
304 spine_iter = iter(self.metadata['spine'])
306 try:
307 for e in tree.iter('i'):
308 if e.tail or e.getnext().tag != 'br':
309 continue
310 chapter = spine_iter.next()
311 try:
312 title = e.text
313 except UnicodeDecodeError:
314 #The chapter's name has been encoded in iso-8859-1 and
315 #inserted into the utf-8 page with no transcoding.
316 title = ('Something to the effect of %r in a utf-8'
317 ' incompatible encoding') % chapter
318 log("chapter %r title %r" % (chapter, title))
319 contributors = []
320 rightsholders = []
321 while True:
322 e = e.getnext()
323 if not e.tail or e.tag != 'br':
324 break
325 #log(e.tail)
326 if e.tail.startswith(u'\u00a9'): # \u00a9 == copyright symbol
327 m = name_re.match(e.tail[1:])
328 author, dates = m.groups()
329 rightsholders.append(author)
330 contributors.append(author)
331 else:
332 m = name_re.match(e.tail)
333 if m is not None:
334 author, dates = m.groups()
335 contributors.append(author)
337 self.credits[chapter] = {
338 "contributors":contributors,
339 "rightsholders": rightsholders,
341 self.titles.append(title)
342 self.contributors.update(contributors)
344 except StopIteration:
345 log('Apparently run out of chapters on title %s!' % title)
class TWikiChapter(BaseChapter):
    """One chapter of a remote TWiki book, parsed into an lxml tree.

    The ``):`` closing the multi-line condition in localise() was lost
    from the pasted source and is reconstructed here.
    """
    # shared across instances so images are only fetched once per process
    image_cache = ImageCache()

    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        self.server = server
        self.book = book
        self.name = chapter_name
        self.use_cache = use_cache
        if cache_dir:
            # per-instance cache overrides the shared class-level one
            self.image_cache = ImageCache(cache_dir)
        self._loadtree(html)

    def localise_links(self):
        """Find image links, convert them to local links, and fetch
        the images from the net so the local links work"""
        images = []
        def localise(oldlink):
            fragments = urlsplit(oldlink)
            if '.' not in fragments.path:
                log('ignoring %s' % oldlink)
                return oldlink
            base, ext = fragments.path.rsplit('.', 1)
            ext = ext.lower()
            # only fetch http(s) assets hosted by this server (or
            # flossmanuals.net) with a recognised extension, under /pub/
            if (not fragments.scheme.startswith('http') or
                (fragments.netloc != self.server and 'flossmanuals.net' not in fragments.netloc) or
                ext not in ('png', 'gif', 'jpg', 'jpeg', 'svg', 'css', 'js') or
                '/pub/' not in base):
                log('ignoring %s' % oldlink)
                return oldlink

            newlink = self.image_cache.fetch_if_necessary(oldlink, use_cache=self.use_cache)
            if newlink is not None:
                images.append(newlink)
                return newlink
            log("can't do anything for %s -- why?" % (oldlink,))
            return oldlink

        self.tree.rewrite_links(localise, base_href=('http://%s/bin/view/%s/%s' %
                                                     (self.server, self.book, self.name)))
        return images
#XXX almost certainly broken and out of date!
class Author(object):
    """Simple record pairing a contributor's name with an email address."""
    def __init__(self, name, email):
        self.name, self.email = name, email
class ImportedChapter(TWikiChapter):
    """Used for git import"""
    def __init__(self, lang, book, chapter_name, text, author, email, date, server=None,
                 use_cache=False, cache_dir=None):
        self.lang = lang
        self.book = book
        self.name = chapter_name
        self.author = Author(author, email)
        self.date = date
        # default server is the language-specific flossmanuals host
        self.server = server if server is not None else '%s.flossmanuals.net' % lang
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
        #XXX is text html-wrapped?
        self._loadtree(text)

    def as_twikitext(self):
        """Get the twiki-style guts of the chapter from the tree"""
        # serialise the <body>, then strip the body tags themselves
        guts = etree.tostring(self.tree.find('body'), method='html')
        guts = re.sub(r'^.*?<body.*?>\s*', '', guts)
        guts = re.sub(r'\s*</body>.*$', '\n', guts)
        return guts
423 return text