catch None urls in another place
[objavi2.git] / objavi / twiki_wrapper.py
blob6f6ceeee919fbed8a7438f5731a1d60c8b552194
1 """Fetch stuff from remote twiki instances"""
3 import os, sys, time, re
4 import tempfile
6 from objavi import config
7 from objavi.book_utils import log, guess_lang, guess_text_dir, make_book_name, decode_html_entities
8 from urllib2 import urlopen
9 from urlparse import urlsplit
10 from booki.bookizip import add_metadata, BookiZip
12 from objavi.xhtml_utils import BaseChapter, ImageCache
14 #from pprint import pformat
16 import lxml.html
17 from lxml import etree
# Minimal standalone-HTML wrapper used when a chapter's raw fragment needs to
# be parsed as a full document.  Substitution keys: %(dir)s is the text
# direction ("ltr"/"rtl"), %(title)s the page title, %(text)s the body HTML.
CHAPTER_TEMPLATE = '''<html dir="%(dir)s">
<head>
<title>%(title)s</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head>
<body>
%(text)s
</body>
</html>
'''
def get_book_list(server):
    """Ask the server for a list of books. Floss Manual TWikis keep such a list at
    /bin/view/TWiki/WebLeftBarWebsList?skin=text but it needs a bit of processing

    If BOOK_LIST_CACHE is non-zero, the book list won't be re-fetched
    in that many seconds, rather it will be read from disk.
    """
    if config.BOOK_LIST_CACHE:
        cache_name = os.path.join(config.CACHE_DIR, '%s.booklist' % server)
        if (os.path.exists(cache_name) and
            os.stat(cache_name).st_mtime + config.BOOK_LIST_CACHE > time.time()):
            # Cache file is fresh enough: serve the whitespace-separated
            # book names straight from disk.
            with open(cache_name) as f:
                return f.read().split()

    url = config.CHAPTER_URL % (server, 'TWiki', 'WebLeftBarWebsList')
    #XXX should use lxml
    log('getting booklist: %s' % url)
    f = urlopen(url)
    try:
        s = f.read()
    finally:
        # close the connection even if read() fails (original leaked it)
        f.close()
    # Book names are the path components of .../WebHome links; drop known
    # non-book webs.
    items = sorted(x for x in re.findall(r'/bin/view/([\w/]+)/WebHome', s)
                   if x not in config.IGNORABLE_TWIKI_BOOKS)
    if config.BOOK_LIST_CACHE:
        with open(cache_name, 'w') as f:
            f.write('\n'.join(items))
    return items
def toc_iterator(server, book):
    """TOC.txt has 3 lines per chapter. Fetch them and yield them in
    triples (as TocItem objects).

    The remote file is closed in a finally block, so it is released even
    when the generator is abandoned before exhaustion (the original only
    closed it after a clean StopIteration).
    """
    url = config.TOC_URL % (server, book)
    log('getting TOC: %s' % url)
    f = urlopen(url)
    try:
        encoding = config.SERVER_DEFAULTS[server]['toc-encoding']
        while True:
            try:
                # one chapter == three consecutive lines
                triple = [f.next(), f.next(), f.next()]
            except StopIteration:
                break
            if encoding is not None:
                # re-encode from the server's declared encoding to utf-8
                triple = [x.decode(encoding).strip().encode('utf-8')
                          for x in triple]
            else:
                triple = [x.strip() for x in triple]
            yield TocItem(*triple)
    finally:
        f.close()
class TocItem(object):
    """This makes sense of the tuples from TOC.txt files"""
    def __init__(self, status, chapter, title):
        # status is
        #  0 - section heading with no chapter
        #  1 - chapter heading
        #  2 - book title
        #
        # chapter is twiki name of the chapter
        # title is a human readable name of the chapter.
        self.status = decode_html_entities(status)
        self.chapter = decode_html_entities(chapter)
        self.title = decode_html_entities(title)

    def is_chapter(self):
        return self.status == '1'

    def is_section(self):
        return self.status == '0'

    def is_title(self):
        return self.status == '2'

    def __str__(self):
        return '<toc: %s>' % ', '.join('%s: %s' % x for x in self.__dict__.iteritems())

    def as_zipitem(self):
        """Convert to the dict form used in a bookizip TOC."""
        # NOTE: the closing brace of this literal was missing in the
        # corrupted source; reconstructed here.
        item = {
            "title": self.title,
            "url": self.chapter + '.html',
            'type': 'chapter',
        }
        if self.is_section():
            item["url"] = None
            item['type'] = 'booki-section'
            item['children'] = []
        return item
127 class TWikiBook(object):
128 credits = None
129 metadata = None
130 def __init__(self, book, server, bookname=None):
131 if bookname is None:
132 bookname = make_book_name(book, server, '.zip')
133 log("*** Extracting TWiki book %s ***" % bookname)
134 self.bookname = bookname
135 self.book = book
136 self.server = server
137 self.workdir = tempfile.mkdtemp(prefix=bookname, dir=config.TMPDIR)
138 os.chmod(self.workdir, 0755)
139 #probable text direction
140 self.dir = guess_text_dir(self.server, self.book)
142 def filepath(self, fn):
143 return os.path.join(self.workdir, fn)
145 def _fetch_metadata(self, force=False):
146 """Get information about a twiki book (as much as is easy and
147 useful). If force is False (default) then it will not be
148 reloaded if it has already been set.
150 if self.metadata is not None and not force:
151 log("not reloading metadata")
152 return
153 meta = {
154 config.DC: {
155 "publisher": {
156 "": ["FLOSS Manuals http://flossmanuals.net"]
158 'identifier': {
159 "": ['http://%s/epub/%s/%s' %
160 (self.server, self.book, time.strftime('%Y.%m.%d-%H.%M.%S'))]
162 'creator': {
163 "": ['The Contributors']
165 'date': {
166 "": [time.strftime('%Y-%m-%d')]
168 'title': {
169 "": [self.book]
172 config.FM: {
173 'server': {"": [self.server]},
174 'book': {"": [self.book]},
178 lang = guess_lang(self.server, self.book)
179 self.dir = guess_text_dir(self.server, self.book)
180 #log(self.server, self.book, lang, self.dir)
181 if lang is not None:
182 add_metadata(meta, 'language', lang)
183 if self.dir is not None:
184 add_metadata(meta, 'dir', self.dir, ns=config.FM)
186 spine = []
187 toc = []
188 section = toc
189 waiting_for_url = []
191 for t in toc_iterator(self.server, self.book):
192 #log(t)
193 if t.is_title():
194 meta[config.DC]['title'][''] = [t.title]
195 continue
197 item = t.as_zipitem()
198 if item['url'] is None:
199 waiting_for_url.append(item)
200 elif waiting_for_url:
201 for wt in waiting_for_url:
202 wt['url'] = item['url']
203 waiting_for_url = []
205 if t.is_chapter():
206 spine.append(t.chapter)
207 section.append(item)
209 elif t.is_section():
210 section = item['children']
211 toc.append(item)
213 self.metadata = {
214 'version': 1,
215 'metadata': meta,
216 'TOC': toc,
217 'spine': spine,
218 'manifest': {},
221 self._parse_credits()
222 for c in self.contributors:
223 add_metadata(meta, 'contributor', c)
227 def make_bookizip(self, filename=None, use_cache=False):
228 """Extract all chapters, images, and metadata, and zip it all
229 up for conversion to epub.
231 If cache is true, images that have been fetched on previous
232 runs will be reused.
234 self._fetch_metadata()
235 if filename is None:
236 filename = self.filepath(self.bookname)
237 bz = BookiZip(filename, self.metadata)
239 all_images = set()
240 for chapter in self.metadata['spine']:
241 contents = self.get_chapter_html(chapter, wrapped=True)
242 c = TWikiChapter(self.server, self.book, chapter, contents,
243 use_cache=use_cache)
244 images = c.localise_links()
245 c.fix_bad_structure()
246 all_images.update(images)
247 #log(chapter, self.credits)
248 bz.add_to_package(chapter, chapter + '.html',
249 c.as_html(), **self.credits.get(chapter, {}))
251 # Add images afterwards, to sift out duplicates
252 for image in all_images:
253 imgdata = c.image_cache.read_local_url(image)
254 bz.add_to_package(image, image, imgdata) #XXX img ownership: where is it?
256 bz.finish()
257 return bz.filename
259 def get_chapter_html(self, chapter, wrapped=False):
260 url = config.CHAPTER_URL % (self.server, self.book, chapter)
261 log('getting chapter: %s' % url)
262 f = urlopen(url)
263 html = f.read()
264 f.close()
265 if wrapped:
266 html = CHAPTER_TEMPLATE % {
267 'title': '%s: %s' % (self.book, chapter),
268 'text': html,
269 'dir': self.dir
271 return html
273 def _parse_credits(self, force=False):
274 # open the Credits chapter that has a list of authors for each chapter.
275 # each chapter is listed thus (linebreaks added):
276 # <i>CHAPTER TITLE</i><br/>&copy; First Author 2007<br/>
277 # Modifications:<br/>Second Author 2007, 2008<br/>
278 # Third Author 2008<br/>Fourth Author 2008<br/><hr/>
280 # where "CHAPTER TITLE" is as appears in TOC.txt, and "X
281 # Author" are the names TWiki has for authors. So the thing
282 # to do is look for the <i> tags and match them to the toc.
284 # the chapter title is not guaranteed unique (but usually is).
285 if self.credits is not None and not force:
286 log("not reloading metadata")
287 return
289 self.credits = {}
290 self.contributors = set()
291 self.titles = []
293 credits_html = self.get_chapter_html('Credits', wrapped=True)
294 try:
295 parser = lxml.html.HTMLParser(encoding='utf-8')
296 tree = lxml.html.document_fromstring(credits_html, parser=parser)
297 except UnicodeDecodeError, e:
298 log("book isn't unicode! (%s)" %(e,))
299 encoding = config.SERVER_DEFAULTS[self.server]['toc-encoding']
300 parser = lxml.html.HTMLParser(encoding=encoding)
301 tree = lxml.html.document_fromstring(credits_html, parser=parser)
303 name_re = re.compile(r'^\s*(.+?) ((?:\d{4},? ?)+)$')
304 spine_iter = iter(self.metadata['spine'])
306 try:
307 for e in tree.iter('i'):
308 if e.tail or e.getnext().tag != 'br':
309 continue
310 chapter = spine_iter.next()
311 try:
312 title = e.text
313 except UnicodeDecodeError:
314 #The chapter's name has been encoded in iso-8859-1 and
315 #inserted into the utf-8 page with no transcoding.
316 title = ('Something to the effect of %r in a utf-8'
317 ' incompatible encoding') % chapter
318 log("chapter %r title %r" % (chapter, title))
319 contributors = []
320 rightsholders = []
321 while True:
322 e = e.getnext()
323 if not e.tail or e.tag != 'br':
324 break
325 #log(e.tail)
326 if e.tail.startswith(u'\u00a9'): # \u00a9 == copyright symbol
327 m = name_re.match(e.tail[1:])
328 author, dates = m.groups()
329 rightsholders.append(author)
330 contributors.append(author)
331 else:
332 m = name_re.match(e.tail)
333 if m is not None:
334 author, dates = m.groups()
335 contributors.append(author)
337 self.credits[chapter] = {
338 "contributors":contributors,
339 "rightsholders": rightsholders,
341 self.titles.append(title)
342 self.contributors.update(contributors)
344 except StopIteration:
345 log('Apparently run out of chapters on title %s!' % title)
class TWikiChapter(BaseChapter):
    """One chapter of a remote TWiki book, parsed into an lxml tree.

    The ``):`` closing the multi-line condition in localise() was lost
    from the pasted source and is reconstructed here.
    """
    # shared across instances so images are only fetched once per process
    image_cache = ImageCache()

    def __init__(self, server, book, chapter_name, html, use_cache=False,
                 cache_dir=None):
        self.server = server
        self.book = book
        self.name = chapter_name
        self.use_cache = use_cache
        if cache_dir:
            # per-instance cache overrides the shared class-level one
            self.image_cache = ImageCache(cache_dir)
        self._loadtree(html)

    def localise_links(self):
        """Find image links, convert them to local links, and fetch
        the images from the net so the local links work"""
        images = []
        def localise(oldlink):
            fragments = urlsplit(oldlink)
            if '.' not in fragments.path:
                log('ignoring %s' % oldlink)
                return oldlink
            base, ext = fragments.path.rsplit('.', 1)
            ext = ext.lower()
            # only fetch http(s) assets hosted by this server (or
            # flossmanuals.net) with a recognised extension, under /pub/
            if (not fragments.scheme.startswith('http') or
                (fragments.netloc != self.server and 'flossmanuals.net' not in fragments.netloc) or
                ext not in ('png', 'gif', 'jpg', 'jpeg', 'svg', 'css', 'js') or
                '/pub/' not in base):
                log('ignoring %s' % oldlink)
                return oldlink

            newlink = self.image_cache.fetch_if_necessary(oldlink, use_cache=self.use_cache)
            if newlink is not None:
                images.append(newlink)
                return newlink
            log("can't do anything for %s -- why?" % (oldlink,))
            return oldlink

        self.tree.rewrite_links(localise, base_href=('http://%s/bin/view/%s/%s' %
                                                     (self.server, self.book, self.name)))
        return images
#XXX almost certainly broken and out of date!
class Author(object):
    """Simple record pairing a contributor's name with an email address."""
    def __init__(self, name, email):
        self.name, self.email = name, email
class ImportedChapter(TWikiChapter):
    """Used for git import"""
    def __init__(self, lang, book, chapter_name, text, author, email, date, server=None,
                 use_cache=False, cache_dir=None):
        self.lang = lang
        self.book = book
        self.name = chapter_name
        self.author = Author(author, email)
        self.date = date
        # default server is the language-specific flossmanuals host
        self.server = server if server is not None else '%s.flossmanuals.net' % lang
        self.use_cache = use_cache
        if cache_dir:
            self.image_cache = ImageCache(cache_dir)
        #XXX is text html-wrapped?
        self._loadtree(text)

    def as_twikitext(self):
        """Get the twiki-style guts of the chapter from the tree"""
        # serialise the <body>, then strip the body tags themselves
        guts = etree.tostring(self.tree.find('body'), method='html')
        guts = re.sub(r'^.*?<body.*?>\s*', '', guts)
        guts = re.sub(r'\s*</body>.*$', '\n', guts)
        return guts
423 return text