1 """Fetch stuff from remote twiki instances"""
3 import os, sys, time, re
4 import tempfile
6 from objavi import config
7 from objavi.book_utils import log, guess_lang, guess_text_dir, make_book_name
8 from urllib2 import urlopen
9 from urlparse import urlsplit
10 from booki.bookizip import add_metadata, BookiZip
12 from objavi.xhtml_utils import BaseChapter, ImageCache
14 #from pprint import pformat
16 import lxml.html
17 from lxml import etree
19 CHAPTER_TEMPLATE = '''<html dir="%(dir)s">
20 <head>
21 <title>%(title)s</title>
22 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
23 </head>
24 <body>
25 %(text)s
26 </body>
27 </html>
28 '''


def get_book_list(server):
    """Ask the server for a list of books.  Floss Manual TWikis keep such a
    list at /bin/view/TWiki/WebLeftBarWebsList?skin=text, but it needs a bit
    of processing.

    If BOOK_LIST_CACHE is non-zero, the book list won't be re-fetched within
    that many seconds; instead it will be read from disk.
    """
    if config.BOOK_LIST_CACHE:
        cache_name = os.path.join(config.BOOK_LIST_CACHE_DIR, '%s.booklist' % server)
        if (os.path.exists(cache_name) and
            os.stat(cache_name).st_mtime + config.BOOK_LIST_CACHE > time.time()):
            f = open(cache_name)
            s = f.read()
            f.close()
            return s.split()

    url = config.CHAPTER_URL % (server, 'TWiki', 'WebLeftBarWebsList')
    #url = 'http://%s/bin/view/TWiki/WebLeftBarWebsList?skin=text' % server
    #XXX should use lxml
    log('getting booklist: %s' % url)
    f = urlopen(url)
    s = f.read()
    f.close()
    items = sorted(x for x in re.findall(r'/bin/view/([\w/]+)/WebHome', s)
                   if x not in config.IGNORABLE_TWIKI_BOOKS)
    if config.BOOK_LIST_CACHE:
        f = open(cache_name, 'w')
        f.write('\n'.join(items))
        f.close()
    return items
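
# A minimal usage sketch, not part of the module proper: the hostname below is
# illustrative only; any Floss Manuals TWiki server known to objavi.config
# would be called the same way.
#
#     books = get_book_list('en.flossmanuals.net')
#     for book in books:
#         print book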


def toc_iterator(server, book):
    """TOC.txt has 3 lines per chapter.  Fetch them and yield them in
    triples.
    """
    url = config.TOC_URL % (server, book)
    log('getting TOC: %s' % url)
    f = urlopen(url)
    encoding = config.SERVER_DEFAULTS[server]['toc-encoding']
    while True:
        try:
            if encoding is not None:
                yield TocItem(f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'),
                              f.next().decode(encoding).strip().encode('utf-8'))
            else:
                yield TocItem(f.next().strip(),
                              f.next().strip(),
                              f.next().strip())
        except StopIteration:
            break
    f.close()
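
# For illustration only (book and server names are invented): a TOC.txt that
# hypothetically contained the six lines
#
#     1
#     Introduction
#     About This Manual
#     0
#     APPENDICES
#     Appendices
#
# would yield TocItem('1', 'Introduction', 'About This Manual') and then
# TocItem('0', 'APPENDICES', 'Appendices').
#
#     for t in toc_iterator('en.flossmanuals.net', 'MyBook'):
#         log(t)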


class TocItem(object):
    """This makes sense of the tuples from TOC.txt files"""
    def __init__(self, status, chapter, title):
        # status is
        #  0 - section heading with no chapter
        #  1 - chapter heading
        #  2 - book title
        #
        # chapter is the twiki name of the chapter
        # title is a human readable name of the chapter.
        self.status = status
        self.chapter = chapter
        self.title = title

    def is_chapter(self):
        return self.status == '1'

    def is_section(self):
        return self.status == '0'

    def is_title(self):
        return self.status == '2'

    def __str__(self):
        return '<toc: %s>' % ', '.join('%s: %s' % x for x in self.__dict__.iteritems())

    def as_zipitem(self):
        item = {
            "title": self.title,
            "url": self.chapter + '.html',
            'type': 'chapter',
            }
        if self.is_section():
            item["url"] = None
            item['type'] = 'booki-section'
            item['children'] = []
        return item
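
# Sketch of what as_zipitem() produces (the values here are invented for
# illustration):
#
#     TocItem('1', 'Introduction', 'About This Manual').as_zipitem()
#     # -> {'title': 'About This Manual', 'url': 'Introduction.html',
#     #     'type': 'chapter'}
#
#     TocItem('0', 'APPENDICES', 'Appendices').as_zipitem()
#     # -> {'title': 'Appendices', 'url': None, 'type': 'booki-section',
#     #     'children': []}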


class TWikiBook(object):
    credits = None
    metadata = None
    def __init__(self, book, server, bookname=None):
        if bookname is None:
            bookname = make_book_name(book, server, '.zip')
        log("*** Extracting TWiki book %s ***" % bookname)
        self.bookname = bookname
        self.book = book
        self.server = server
        self.workdir = tempfile.mkdtemp(prefix=bookname, dir=config.TMPDIR)
        os.chmod(self.workdir, 0755)
        #probable text direction
        self.dir = guess_text_dir(self.server, self.book)

    def filepath(self, fn):
        return os.path.join(self.workdir, fn)

    def _fetch_metadata(self, force=False):
        """Get information about a twiki book (as much as is easy and
        useful).  If force is False (the default), the metadata will not
        be reloaded once it has already been set.
        """
        if self.metadata is not None and not force:
            log("not reloading metadata")
            return
        meta = {
            config.DC: {
                "publisher": {
                    "": ["FLOSS Manuals http://flossmanuals.net"]
                    },
                'identifier': {
                    "": ['http://%s/epub/%s/%s' %
                         (self.server, self.book, time.strftime('%Y.%m.%d-%H.%M.%S'))]
                    },
                'creator': {
                    "": ['The Contributors']
                    },
                'date': {
                    "": [time.strftime('%Y-%m-%d')]
                    },
                'title': {
                    "": [self.book]
                    },
                },
            config.FM: {
                'server': {"": [self.server]},
                'book': {"": [self.book]},
                },
            }

        lang = guess_lang(self.server, self.book)
        self.dir = guess_text_dir(self.server, self.book)
        #log(self.server, self.book, lang, self.dir)
        if lang is not None:
            add_metadata(meta, 'language', lang)
        if self.dir is not None:
            add_metadata(meta, 'dir', self.dir, ns=config.FM)

        spine = []
        toc = []
        section = toc
        waiting_for_url = []

        for t in toc_iterator(self.server, self.book):
            #log(t)
            if t.is_title():
                meta[config.DC]['title'][''] = [t.title]
                continue

            item = t.as_zipitem()
            if item['url'] is None:
                waiting_for_url.append(item)
            elif waiting_for_url:
                for wt in waiting_for_url:
                    wt['url'] = item['url']
                waiting_for_url = []

            if t.is_chapter():
                spine.append(t.chapter)
                section.append(item)

            elif t.is_section():
                section = item['children']
                toc.append(item)

        self.metadata = {
            'version': 1,
            'metadata': meta,
            'TOC': toc,
            'spine': spine,
            'manifest': {},
            }

        self._parse_credits()
        for c in self.contributors:
            add_metadata(meta, 'contributor', c)

    def make_bookizip(self, filename=None, use_cache=False):
        """Extract all chapters, images, and metadata, and zip it all
        up for conversion to epub.

        If use_cache is true, images that have been fetched on previous
        runs will be reused.
        """
        self._fetch_metadata()
        if filename is None:
            filename = self.filepath(self.bookname)
        bz = BookiZip(filename, self.metadata)

        all_images = set()
        for chapter in self.metadata['spine']:
            contents = self.get_chapter_html(chapter, wrapped=True)
            c = TWikiChapter(self.server, self.book, chapter, contents,
                             use_cache=use_cache)
            images = c.localise_links()
            all_images.update(images)
            #log(chapter, self.credits)
            bz.add_to_package(chapter, chapter + '.html',
                              c.as_html(), **self.credits.get(chapter, {}))

        # Add images afterwards, to sift out duplicates
        for image in all_images:
            imgdata = c.image_cache.read_local_url(image)
            bz.add_to_package(image, image, imgdata) #XXX img ownership: where is it?

        bz.finish()
        return bz.filename
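
    # A hedged end-to-end sketch (the book and server names are invented):
    #
    #     twiki_book = TWikiBook('MyBook', 'en.flossmanuals.net')
    #     zipname = twiki_book.make_bookizip(use_cache=True)
    #
    # make_bookizip() returns the path of the finished booki-zip, ready for
    # the epub conversion mentioned in its docstring.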

    def get_chapter_html(self, chapter, wrapped=False):
        url = config.CHAPTER_URL % (self.server, self.book, chapter)
        log('getting chapter: %s' % url)
        f = urlopen(url)
        html = f.read()
        f.close()
        if wrapped:
            html = CHAPTER_TEMPLATE % {
                'title': '%s: %s' % (self.book, chapter),
                'text': html,
                'dir': self.dir,
                }
        return html

    def _parse_credits(self, force=False):
        # Open the Credits chapter, which has a list of authors for each
        # chapter.  Each chapter is listed thus (linebreaks added):
        #
        #   <i>CHAPTER TITLE</i><br/>&copy; First Author 2007<br/>
        #   Modifications:<br/>Second Author 2007, 2008<br/>
        #   Third Author 2008<br/>Fourth Author 2008<br/><hr/>
        #
        # where "CHAPTER TITLE" is as it appears in TOC.txt, and "X
        # Author" are the names TWiki has for the authors.  So the thing
        # to do is look for the <i> tags and match them to the toc.
        #
        # The chapter title is not guaranteed to be unique (but it usually is).
        if self.credits is not None and not force:
            log("not reloading credits")
            return

        self.credits = {}
        self.contributors = set()
        self.titles = []

        credits_html = self.get_chapter_html('Credits', wrapped=True)
        try:
            parser = lxml.html.HTMLParser(encoding='utf-8')
            tree = lxml.html.document_fromstring(credits_html, parser=parser)
        except UnicodeDecodeError, e:
            log("book isn't unicode! (%s)" % (e,))
            encoding = config.SERVER_DEFAULTS[self.server]['toc-encoding']
            parser = lxml.html.HTMLParser(encoding=encoding)
            tree = lxml.html.document_fromstring(credits_html, parser=parser)

        name_re = re.compile(r'^\s*(.+?) ((?:\d{4},? ?)+)$')
        spine_iter = iter(self.metadata['spine'])

        try:
            for e in tree.iter('i'):
                if e.tail or e.getnext().tag != 'br':
                    continue
                title = e.text
                chapter = spine_iter.next()
                log("chapter %r title %r" % (chapter, title))
                contributors = []
                rightsholders = []
                while True:
                    e = e.getnext()
                    if not e.tail or e.tag != 'br':
                        break
                    #log(e.tail)
                    if e.tail.startswith(u'\u00a9'):  # \u00a9 == copyright symbol
                        m = name_re.match(e.tail[1:])
                        author, dates = m.groups()
                        rightsholders.append(author)
                        contributors.append(author)
                    else:
                        m = name_re.match(e.tail)
                        if m is not None:
                            author, dates = m.groups()
                            contributors.append(author)

                self.credits[chapter] = {
                    "contributors": contributors,
                    "rightsholders": rightsholders,
                    }
                self.titles.append(title)
                self.contributors.update(contributors)

        except StopIteration:
            log('Apparently ran out of chapters at title %s!' % title)
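
    # For illustration only (the names are invented): after _parse_credits()
    # runs, self.credits maps chapter names to dicts like
    #
    #     {'Introduction': {'contributors': ['First Author', 'Second Author'],
    #                       'rightsholders': ['First Author']}}
    #
    # and self.contributors holds the union of all contributor names.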


class TWikiChapter(BaseChapter):
    image_cache = ImageCache()

    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        self.server = server
        self.book = book
        self.name = chapter_name
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
        self._loadtree(html)

    def localise_links(self):
        """Find image links, convert them to local links, and fetch
        the images from the net so the local links work."""
        images = []
        def localise(oldlink):
            fragments = urlsplit(oldlink)
            if '.' not in fragments.path:
                log('ignoring %s' % oldlink)
                return oldlink
            base, ext = fragments.path.rsplit('.', 1)
            ext = ext.lower()
            if (not fragments.scheme.startswith('http') or
                (fragments.netloc != self.server and 'flossmanuals.net' not in fragments.netloc) or
                ext not in ('png', 'gif', 'jpg', 'jpeg', 'svg', 'css', 'js') or
                '/pub/' not in base
                ):
                log('ignoring %s' % oldlink)
                return oldlink

            newlink = self.image_cache.fetch_if_necessary(oldlink, use_cache=self.use_cache)
            if newlink is not None:
                images.append(newlink)
                return newlink
            log("can't do anything for %s -- why?" % (oldlink,))
            return oldlink

        self.tree.rewrite_links(localise, base_href=('http://%s/bin/view/%s/%s' %
                                                     (self.server, self.book, self.name)))
        return images
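
    # A rough sketch of the intended effect (the URL is made up): an image
    # link such as
    #
    #     http://en.flossmanuals.net/pub/MyBook/Introduction/diagram.png
    #
    # gets fetched into the image cache and rewritten to whatever local name
    # fetch_if_necessary() returns, while off-site links, non-image links and
    # anything outside /pub/ are left untouched.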


#XXX almost certainly broken and out of date!
class Author(object):
    def __init__(self, name, email):
        self.name = name
        self.email = email


class ImportedChapter(TWikiChapter):
    """Used for git import"""
    def __init__(self, lang, book, chapter_name, text, author, email, date, server=None,
                 use_cache=False, cache_dir=None):
        self.lang = lang
        self.book = book
        self.name = chapter_name
        self.author = Author(author, email)
        self.date = date
        if server is None:
            server = '%s.flossmanuals.net' % lang
        self.server = server
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
        #XXX is text html-wrapped?
        self._loadtree(text)

    def as_twikitext(self):
        """Get the twiki-style guts of the chapter from the tree"""
        text = etree.tostring(self.tree.find('body'), method='html')
        text = re.sub(r'^.*?<body.*?>\s*', '', text)
        text = re.sub(r'\s*</body>.*$', '\n', text)
        return text
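
    # Hedged sketch (arguments invented; note the XXX above, this class may be
    # stale): an ImportedChapter wraps one chapter for git import, and
    # as_twikitext() strips the <body> wrapper off again so that only the
    # twiki-style fragment remains.
    #
    #     ch = ImportedChapter('en', 'MyBook', 'Introduction',
    #                          '<html><body><p>hi</p></body></html>',
    #                          'An Author', 'author@example.com', '2010-01-01')
    #     fragment = ch.as_twikitext()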