1 """Fetch stuff from remote twiki instances"""
3 import os, sys, time, re
4 import tempfile
6 from objavi import config
7 from objavi.book_utils import log, guess_lang, guess_text_dir, make_book_name
8 from urllib2 import urlopen
9 from urlparse import urlsplit
10 from booki.bookizip import add_metadata, BookiZip
12 from objavi.xhtml_utils import BaseChapter, ImageCache
14 #from pprint import pformat
16 import lxml.html
17 from lxml import etree
19 CHAPTER_TEMPLATE = '''<html dir="%(dir)s">
20 <head>
21 <title>%(title)s</title>
22 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
23 </head>
24 <body>
25 %(text)s
26 </body>
27 </html>
28 '''


def get_book_list(server):
    """Ask the server for a list of books.  Floss Manual TWikis keep such a
    list at /bin/view/TWiki/WebLeftBarWebsList?skin=text, but it needs a bit
    of processing.

    If BOOK_LIST_CACHE is non-zero, the book list won't be re-fetched within
    that many seconds; instead it will be read from disk.
    """
    if config.BOOK_LIST_CACHE:
        cache_name = os.path.join(config.BOOK_LIST_CACHE_DIR, '%s.booklist' % server)
        if (os.path.exists(cache_name) and
            os.stat(cache_name).st_mtime + config.BOOK_LIST_CACHE > time.time()):
            f = open(cache_name)
            s = f.read()
            f.close()
            return s.split()

    url = config.CHAPTER_URL % (server, 'TWiki', 'WebLeftBarWebsList')
    #url = 'http://%s/bin/view/TWiki/WebLeftBarWebsList?skin=text' % server
    #XXX should use lxml
    log('getting booklist: %s' % url)
    f = urlopen(url)
    s = f.read()
    f.close()
    items = sorted(x for x in re.findall(r'/bin/view/([\w/]+)/WebHome', s)
                   if x not in config.IGNORABLE_TWIKI_BOOKS)
    if config.BOOK_LIST_CACHE:
        f = open(cache_name, 'w')
        f.write('\n'.join(items))
        f.close()
    return items
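
# A minimal usage sketch, not part of the module proper: the hostname below is
# illustrative only; any Floss Manuals TWiki server known to objavi.config
# would be called the same way.
#
#     books = get_book_list('en.flossmanuals.net')
#     for book in books:
#         print book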


def toc_iterator(server, book):
    """TOC.txt has 3 lines per chapter.  Fetch them and yield them in
    triples.
    """
    url = config.TOC_URL % (server, book)
    log('getting TOC: %s' % url)
    f = urlopen(url)
    encoding = config.SERVER_DEFAULTS[server]['toc-encoding']
    while True:
        try:
            if encoding is not None:
                yield TocItem(f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'))
            else:
                yield TocItem(f.next().strip(),
                              f.next().strip(),
                              f.next().strip())
        except StopIteration:
            break
    f.close()
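
# For illustration only (book and server names are invented): a TOC.txt that
# hypothetically contained the six lines
#
#     1
#     Introduction
#     About This Manual
#     0
#     APPENDICES
#     Appendices
#
# would yield TocItem('1', 'Introduction', 'About This Manual') and then
# TocItem('0', 'APPENDICES', 'Appendices').
#
#     for t in toc_iterator('en.flossmanuals.net', 'MyBook'):
#         log(t)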


class TocItem(object):
    """This makes sense of the tuples from TOC.txt files"""
    def __init__(self, status, chapter, title):
        # status is
        #  0 - section heading with no chapter
        #  1 - chapter heading
        #  2 - book title
        #
        # chapter is the twiki name of the chapter
        # title is a human readable name of the chapter.
        self.status = status
        self.chapter = chapter
        self.title = title

    def is_chapter(self):
        return self.status == '1'

    def is_section(self):
        return self.status == '0'

    def is_title(self):
        return self.status == '2'

    def __str__(self):
        return '<toc: %s>' % ', '.join('%s: %s' % x for x in self.__dict__.iteritems())

    def as_zipitem(self):
        item = {
            "title": self.title,
            "url": self.chapter + '.html',
            'type': 'chapter',
            }
        if self.is_section():
            item["url"] = None
            item['type'] = 'booki-section'
            item['children'] = []
        return item
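
# Sketch of what as_zipitem() produces (the values here are invented for
# illustration):
#
#     TocItem('1', 'Introduction', 'About This Manual').as_zipitem()
#     # -> {'title': 'About This Manual', 'url': 'Introduction.html',
#     #     'type': 'chapter'}
#
#     TocItem('0', 'APPENDICES', 'Appendices').as_zipitem()
#     # -> {'title': 'Appendices', 'url': None, 'type': 'booki-section',
#     #     'children': []}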


class TWikiBook(object):
    credits = None
    metadata = None
    def __init__(self, book, server, bookname=None):
        if bookname is None:
            bookname = make_book_name(book, server, '.zip')
        log("*** Extracting TWiki book %s ***" % bookname)
        self.bookname = bookname
        self.book = book
        self.server = server
        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=config.TMPDIR)
        os.chmod(self.workdir, 0755)
        #probable text direction
        self.dir = guess_text_dir(self.server, self.book)

    def filepath(self, fn):
        return os.path.join(self.workdir, fn)

    def _fetch_metadata(self, force=False):
        """Get information about a twiki book (as much as is easy and
        useful).  If force is False (the default), the metadata will not
        be reloaded once it has already been set.
        """
        if self.metadata is not None and not force:
            log("not reloading metadata")
            return
        meta = {
            config.DC: {
                "publisher": {
                    "": ["FLOSS Manuals http://flossmanuals.net"]
                    },
                'identifier': {
                    "": ['http://%s/epub/%s/%s' %
                         (self.server, self.book, time.strftime('%Y.%m.%d-%H.%M.%S'))]
                    },
                'creator': {
                    "": ['The Contributors']
                    },
                'date': {
                    "": [time.strftime('%Y-%m-%d')]
                    },
                'title': {
                    "": [self.book]
                    },
                },
            config.FM: {
                'server': {"": [self.server]},
                'book': {"": [self.book]},
                },
            }

        lang = guess_lang(self.server, self.book)
        self.dir = guess_text_dir(self.server, self.book)
        #log(self.server, self.book, lang, self.dir)
        if lang is not None:
            add_metadata(meta, 'language', lang)
        if self.dir is not None:
            add_metadata(meta, 'dir', self.dir, ns=config.FM)

        spine = []
        toc = []
        section = toc
        waiting_for_url = []

        for t in toc_iterator(self.server, self.book):
            #log(t)
            if t.is_title():
                meta[config.DC]['title'][''] = [t.title]
                continue

            item = t.as_zipitem()
            if item['url'] is None:
                waiting_for_url.append(item)
            elif waiting_for_url:
                for wt in waiting_for_url:
                    wt['url'] = item['url']
                waiting_for_url = []

            if t.is_chapter():
                spine.append(t.chapter)
                section.append(item)

            elif t.is_section():
                section = item['children']
                toc.append(item)

        self.metadata = {
            'version': 1,
            'metadata': meta,
            'TOC': toc,
            'spine': spine,
            'manifest': {},
            }

        self._parse_credits()
        for c in self.contributors:
            add_metadata(meta, 'contributor', c)

    def make_bookizip(self, filename=None, use_cache=False):
        """Extract all chapters, images, and metadata, and zip it all
        up for conversion to epub.

        If use_cache is true, images that have been fetched on previous
        runs will be reused.
        """
        self._fetch_metadata()
        if filename is None:
            filename = self.filepath(self.bookname)
        bz = BookiZip(filename, self.metadata)

        all_images = set()
        for chapter in self.metadata['spine']:
            contents = self.get_chapter_html(chapter, wrapped=True)
            c = TWikiChapter(self.server, self.book, chapter, contents,
                             use_cache=use_cache)
            images = c.localise_links()
            all_images.update(images)
            #log(chapter, self.credits)
            bz.add_to_package(chapter, chapter + '.html',
                              c.as_html(), **self.credits.get(chapter, {}))

        # Add images afterwards, to sift out duplicates
        for image in all_images:
            imgdata = c.image_cache.read_local_url(image)
            bz.add_to_package(image, image, imgdata) #XXX img ownership: where is it?

        bz.finish()
        return bz.filename
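
    # A hedged end-to-end sketch (the book and server names are invented):
    #
    #     twiki_book = TWikiBook('MyBook', 'en.flossmanuals.net')
    #     zipname = twiki_book.make_bookizip(use_cache=True)
    #
    # make_bookizip() returns the path of the finished booki-zip, ready for
    # the epub conversion mentioned in its docstring.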

    def get_chapter_html(self, chapter, wrapped=False):
        url = config.CHAPTER_URL % (self.server, self.book, chapter)
        log('getting chapter: %s' % url)
        f = urlopen(url)
        html = f.read()
        f.close()
        if wrapped:
            html = CHAPTER_TEMPLATE % {
                'title': '%s: %s' % (self.book, chapter),
                'text': html,
                'dir': self.dir,
                }
        return html

    def _parse_credits(self, force=False):
        # Open the Credits chapter, which has a list of authors for each
        # chapter.  Each chapter is listed thus (linebreaks added):
        #
        #   <i>CHAPTER TITLE</i><br/>&copy; First Author 2007<br/>
        #   Modifications:<br/>Second Author 2007, 2008<br/>
        #   Third Author 2008<br/>Fourth Author 2008<br/><hr/>
        #
        # where "CHAPTER TITLE" is as it appears in TOC.txt, and "X
        # Author" are the names TWiki has for the authors.  So the thing
        # to do is look for the <i> tags and match them to the toc.
        #
        # The chapter title is not guaranteed to be unique (but it usually is).
        if self.credits is not None and not force:
            log("not reloading credits")
            return

        self.credits = {}
        self.contributors = set()
        self.titles = []

        credits_html = self.get_chapter_html('Credits', wrapped=True)
        try:
            parser = lxml.html.HTMLParser(encoding='utf-8')
            tree = lxml.html.document_fromstring(credits_html, parser=parser)
        except UnicodeDecodeError, e:
            log("book isn't unicode! (%s)" % (e,))
            encoding = config.SERVER_DEFAULTS[self.server]['toc-encoding']
            parser = lxml.html.HTMLParser(encoding=encoding)
            tree = lxml.html.document_fromstring(credits_html, parser=parser)

        name_re = re.compile(r'^\s*(.+?) ((?:\d{4},? ?)+)$')
        spine_iter = iter(self.metadata['spine'])

        try:
            for e in tree.iter('i'):
                if e.tail or e.getnext().tag != 'br':
                    continue
                title = e.text
                chapter = spine_iter.next()
                log("chapter %r title %r" % (chapter, title))
                contributors = []
                rightsholders = []
                while True:
                    e = e.getnext()
                    if not e.tail or e.tag != 'br':
                        break
                    #log(e.tail)
                    if e.tail.startswith(u'\u00a9'):  # \u00a9 == copyright symbol
                        m = name_re.match(e.tail[1:])
                        author, dates = m.groups()
                        rightsholders.append(author)
                        contributors.append(author)
                    else:
                        m = name_re.match(e.tail)
                        if m is not None:
                            author, dates = m.groups()
                            contributors.append(author)

                self.credits[chapter] = {
                    "contributors": contributors,
                    "rightsholders": rightsholders,
                    }
                self.titles.append(title)
                self.contributors.update(contributors)

        except StopIteration:
            log('Apparently ran out of chapters at title %s!' % title)
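
    # For illustration only (the names are invented): after _parse_credits()
    # runs, self.credits maps chapter names to dicts like
    #
    #     {'Introduction': {'contributors': ['First Author', 'Second Author'],
    #                       'rightsholders': ['First Author']}}
    #
    # and self.contributors holds the union of all contributor names.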


class TWikiChapter(BaseChapter):
    image_cache = ImageCache()

    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        self.server = server
        self.book = book
        self.name = chapter_name
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
        self._loadtree(html)

    def localise_links(self):
        """Find image links, convert them to local links, and fetch
        the images from the net so the local links work."""
        images = []
        def localise(oldlink):
            fragments = urlsplit(oldlink)
            if '.' not in fragments.path:
                log('ignoring %s' % oldlink)
                return oldlink
            base, ext = fragments.path.rsplit('.', 1)
            ext = ext.lower()
            if (not fragments.scheme.startswith('http') or
                (fragments.netloc != self.server and 'flossmanuals.net' not in fragments.netloc) or
                ext not in ('png', 'gif', 'jpg', 'jpeg', 'svg', 'css', 'js') or
                '/pub/' not in base
                ):
                log('ignoring %s' % oldlink)
                return oldlink

            newlink = self.image_cache.fetch_if_necessary(oldlink, use_cache=self.use_cache)
            if newlink is not None:
                images.append(newlink)
                return newlink
            log("can't do anything for %s -- why?" % (oldlink,))
            return oldlink

        self.tree.rewrite_links(localise, base_href=('http://%s/bin/view/%s/%s' %
                                                     (self.server, self.book, self.name)))
        return images
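
    # A rough sketch of the intended effect (the URL is made up): an image
    # link such as
    #
    #     http://en.flossmanuals.net/pub/MyBook/Introduction/diagram.png
    #
    # gets fetched into the image cache and rewritten to whatever local name
    # fetch_if_necessary() returns, while off-site links, non-image links and
    # anything outside /pub/ are left untouched.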


#XXX almost certainly broken and out of date!
class Author(object):
    def __init__(self, name, email):
        self.name = name
        self.email = email


class ImportedChapter(TWikiChapter):
    """Used for git import"""
    def __init__(self, lang, book, chapter_name, text, author, email, date, server=None,
                 use_cache=False, cache_dir=None):
        self.lang = lang
        self.book = book
        self.name = chapter_name
        self.author = Author(author, email)
        self.date = date
        if server is None:
            server = '%s.flossmanuals.net' % lang
        self.server = server
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
        #XXX is text html-wrapped?
        self._loadtree(text)

    def as_twikitext(self):
        """Get the twiki-style guts of the chapter from the tree"""
        text = etree.tostring(self.tree.find('body'), method='html')
        text = re.sub(r'^.*?<body.*?>\s*', '', text)
        text = re.sub(r'\s*</body>.*$', '\n', text)
        return text
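
    # Hedged sketch (arguments invented; note the XXX above, this class may be
    # stale): an ImportedChapter wraps one chapter for git import, and
    # as_twikitext() strips the <body> wrapper off again so that only the
    # twiki-style fragment remains.
    #
    #     ch = ImportedChapter('en', 'MyBook', 'Introduction',
    #                          '<html><body><p>hi</p></body></html>',
    #                          'An Author', 'author@example.com', '2010-01-01')
    #     fragment = ch.as_twikitext()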