1 """Fetch stuff from remote twiki instances"""
3 import os
, sys
, time
, re
6 from objavi
import config
7 from objavi
.book_utils
import log
, guess_lang
, guess_text_dir
, make_book_name
8 from urllib2
import urlopen
9 from urlparse
import urlsplit
10 from booki
.bookizip
import add_metadata
, BookiZip
12 from objavi
.xhtml_utils
import BaseChapter
, ImageCache
14 #from pprint import pformat
17 from lxml
import etree
19 CHAPTER_TEMPLATE
= '''<html dir="%(dir)s">
21 <title>%(title)s</title>
22 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
30 def get_book_list(server
):
31 """Ask the server for a list of books. Floss Manual TWikis keep such a list at
32 /bin/view/TWiki/WebLeftBarWebsList?skin=text but it needs a bit of processing
34 If BOOK_LIST_CACHE is non-zero, the book list won't be re-fetched
35 in that many seconds, rather it will be read from disk.
37 if config
.BOOK_LIST_CACHE
:
38 cache_name
= os
.path
.join(config
.BOOK_LIST_CACHE_DIR
, '%s.booklist' % server
)
39 if (os
.path
.exists(cache_name
) and
40 os
.stat(cache_name
).st_mtime
+ config
.BOOK_LIST_CACHE
> time
.time()):
46 url
= config
.CHAPTER_URL
% (server
, 'TWiki', 'WebLeftBarWebsList')
47 #url = 'http://%s/bin/view/TWiki/WebLeftBarWebsList?skin=text' % server
49 log('getting booklist: %s' % url
)
53 items
= sorted(x
for x
in re
.findall(r
'/bin/view/([\w/]+)/WebHome', s
)
54 if x
not in config
.IGNORABLE_TWIKI_BOOKS
)
55 if config
.BOOK_LIST_CACHE
:
56 f
= open(cache_name
, 'w')
57 f
.write('\n'.join(items
))
62 def toc_iterator(server
, book
):
63 """TOC.txt has 3 lines per chapter. Fetch them and yield them in
66 url
= config
.TOC_URL
% (server
, book
)
67 log('getting TOC: %s' % url
)
69 encoding
= config
.SERVER_DEFAULTS
[server
]['toc-encoding']
72 if encoding
is not None:
73 yield TocItem(f
.next().decode(encoding
).strip().encode('utf-8'),
74 f
.next().decode(encoding
).strip().encode('utf-8'),
75 f
.next().decode(encoding
).strip().encode('utf-8'))
77 yield TocItem(f
.next().strip(),
88 class TocItem(object):
89 """This makes sense of the tuples from TOC.txt files"""
90 def __init__(self
, status
, chapter
, title
):
92 # 0 - section heading with no chapter
96 # chapter is twiki name of the chapter
97 # title is a human readable name of the chapter.
99 self
.chapter
= chapter
def is_chapter(self):
    """Report whether this TOC entry's status is '1' (a chapter)."""
    status = self.status
    return status == '1'
def is_section(self):
    """Report whether this TOC entry's status is '0' (a section heading)."""
    status = self.status
    return status == '0'
109 return self
.status
== '2'
112 return '<toc: %s>' % ', '.join('%s: %s' % x
for x
in self
.__dict
__.iteritems())
114 def as_zipitem(self
):
117 "url": self
.chapter
+ '.html',
120 if self
.is_section():
122 item
['type'] = 'booki-section'
123 item
['children'] = []
127 class TWikiBook(object):
130 def __init__(self
, book
, server
, bookname
=None):
132 bookname
= make_book_name(book
, server
, '.zip')
133 log("*** Extracting TWiki book %s ***" % bookname
)
134 self
.bookname
= bookname
137 self
.workdir
= tempfile
.mkdtemp(prefix
=bookname
, dir=config
.TMPDIR
)
138 os
.chmod(self
.workdir
, 0755)
139 #probable text direction
140 self
.dir = guess_text_dir(self
.server
, self
.book
)
def filepath(self, fn):
    """Return *fn* joined onto this book's working directory."""
    base = self.workdir
    return os.path.join(base, fn)
145 def _fetch_metadata(self
, force
=False):
146 """Get information about a twiki book (as much as is easy and
147 useful). If force is False (default) then it will not be
148 reloaded if it has already been set.
150 if self
.metadata
is not None and not force
:
151 log("not reloading metadata")
156 "": ["FLOSS Manuals http://flossmanuals.net"]
159 "": ['http://%s/epub/%s/%s' %
160 (self
.server
, self
.book
, time
.strftime('%Y.%m.%d-%H.%M.%S'))]
163 "": ['The Contributors']
166 "": [time
.strftime('%Y-%m-%d')]
173 'server': {"": [self
.server
]},
174 'book': {"": [self
.book
]},
178 lang
= guess_lang(self
.server
, self
.book
)
179 self
.dir = guess_text_dir(self
.server
, self
.book
)
180 #log(self.server, self.book, lang, self.dir)
182 add_metadata(meta
, 'language', lang
)
183 if self
.dir is not None:
184 add_metadata(meta
, 'dir', self
.dir, ns
=config
.FM
)
191 for t
in toc_iterator(self
.server
, self
.book
):
194 meta
[config
.DC
]['title'][''] = [t
.title
]
197 item
= t
.as_zipitem()
198 if item
['url'] is None:
199 waiting_for_url
.append(item
)
200 elif waiting_for_url
:
201 for wt
in waiting_for_url
:
202 wt
['url'] = item
['url']
206 spine
.append(t
.chapter
)
210 section
= item
['children']
221 self
._parse
_credits
()
222 for c
in self
.contributors
:
223 add_metadata(meta
, 'contributor', c
)
227 def make_bookizip(self
, filename
=None, use_cache
=False):
228 """Extract all chapters, images, and metadata, and zip it all
229 up for conversion to epub.
231 If cache is true, images that have been fetched on previous
234 self
._fetch
_metadata
()
236 filename
= self
.filepath(self
.bookname
)
237 bz
= BookiZip(filename
, self
.metadata
)
240 for chapter
in self
.metadata
['spine']:
241 contents
= self
.get_chapter_html(chapter
, wrapped
=True)
242 c
= TWikiChapter(self
.server
, self
.book
, chapter
, contents
,
244 images
= c
.localise_links()
245 all_images
.update(images
)
246 #log(chapter, self.credits)
247 bz
.add_to_package(chapter
, chapter
+ '.html',
248 c
.as_html(), **self
.credits
.get(chapter
, {}))
250 # Add images afterwards, to sift out duplicates
251 for image
in all_images
:
252 imgdata
= c
.image_cache
.read_local_url(image
)
253 bz
.add_to_package(image
, image
, imgdata
) #XXX img ownership: where is it?
258 def get_chapter_html(self
, chapter
, wrapped
=False):
259 url
= config
.CHAPTER_URL
% (self
.server
, self
.book
, chapter
)
260 log('getting chapter: %s' % url
)
265 html
= CHAPTER_TEMPLATE
% {
266 'title': '%s: %s' % (self
.book
, chapter
),
272 def _parse_credits(self
, force
=False):
273 # open the Credits chapter that has a list of authors for each chapter.
274 # each chapter is listed thus (linebreaks added):
275 # <i>CHAPTER TITLE</i><br/>© First Author 2007<br/>
276 # Modifications:<br/>Second Author 2007, 2008<br/>
277 # Third Author 2008<br/>Fourth Author 2008<br/><hr/>
279 # where "CHAPTER TITLE" is as appears in TOC.txt, and "X
280 # Author" are the names TWiki has for authors. So the thing
281 # to do is look for the <i> tags and match them to the toc.
283 # the chapter title is not guaranteed unique (but usually is).
284 if self
.credits
is not None and not force
:
285 log("not reloading metadata")
289 self
.contributors
= set()
292 credits_html
= self
.get_chapter_html('Credits', wrapped
=True)
294 parser
= lxml
.html
.HTMLParser(encoding
='utf-8')
295 tree
= lxml
.html
.document_fromstring(credits_html
, parser
=parser
)
296 except UnicodeDecodeError, e
:
297 log("book isn't unicode! (%s)" %(e
,))
298 encoding
= config
.SERVER_DEFAULTS
[self
.server
]['toc-encoding']
299 parser
= lxml
.html
.HTMLParser(encoding
=encoding
)
300 tree
= lxml
.html
.document_fromstring(credits_html
, parser
=parser
)
302 name_re
= re
.compile(r
'^\s*(.+?) ((?:\d{4},? ?)+)$')
303 spine_iter
= iter(self
.metadata
['spine'])
306 for e
in tree
.iter('i'):
307 if e
.tail
or e
.getnext().tag
!= 'br':
310 chapter
= spine_iter
.next()
311 log("chapter %r title %r" % (chapter
, title
))
316 if not e
.tail
or e
.tag
!= 'br':
319 if e
.tail
.startswith(u
'\u00a9'): # \u00a9 == copyright symbol
320 m
= name_re
.match(e
.tail
[1:])
321 author
, dates
= m
.groups()
322 rightsholders
.append(author
)
323 contributors
.append(author
)
325 m
= name_re
.match(e
.tail
)
327 author
, dates
= m
.groups()
328 contributors
.append(author
)
330 self
.credits
[chapter
] = {
331 "contributors":contributors
,
332 "rightsholders": rightsholders
,
334 self
.titles
.append(title
)
335 self
.contributors
.update(contributors
)
337 except StopIteration:
338 log('Apparently run out of chapters on title %s!' % title
)
343 class TWikiChapter(BaseChapter
):
344 image_cache
= ImageCache()
346 def __init__(self
, server
, book
, chapter_name
, html
, use_cache
=False,
350 self
.name
= chapter_name
351 self
.use_cache
= use_cache
353 self
.image_cache
= ImageCache(cache_dir
)
356 def localise_links(self
):
357 """Find image links, convert them to local links, and fetch
358 the images from the net so the local links work"""
360 def localise(oldlink
):
361 fragments
= urlsplit(oldlink
)
362 if '.' not in fragments
.path
:
363 log('ignoring %s' % oldlink
)
365 base
, ext
= fragments
.path
.rsplit('.', 1)
367 if (not fragments
.scheme
.startswith('http') or
368 (fragments
.netloc
!= self
.server
and 'flossmanuals.net' not in fragments
.netloc
) or
369 ext
not in ('png', 'gif', 'jpg', 'jpeg', 'svg', 'css', 'js') or
372 log('ignoring %s' % oldlink
)
375 newlink
= self
.image_cache
.fetch_if_necessary(oldlink
, use_cache
=self
.use_cache
)
376 if newlink
is not None:
377 images
.append(newlink
)
379 log("can't do anything for %s -- why?" % (oldlink
,))
382 self
.tree
.rewrite_links(localise
, base_href
=('http://%s/bin/view/%s/%s' %
383 (self
.server
, self
.book
, self
.name
)))
387 #XXX almost certainly broken and out of date!
388 class Author(object):
389 def __init__(self
, name
, email
):
393 class ImportedChapter(TWikiChapter
):
394 """Used for git import"""
395 def __init__(self
, lang
, book
, chapter_name
, text
, author
, email
, date
, server
=None,
396 use_cache
=False, cache_dir
=None):
399 self
.name
= chapter_name
400 self
.author
= Author(author
, email
)
403 server
= '%s.flossmanuals.net' % lang
405 self
.use_cache
= use_cache
407 self
.image_cache
= ImageCache(cache_dir
)
408 #XXX is text html-wrapped?
411 def as_twikitext(self
):
412 """Get the twiki-style guts of the chapter from the tree"""
413 text
= etree
.tostring(self
.tree
.find('body'), method
='html')
414 text
= re
.sub(r
'^.*?<body.*?>\s*', '', text
)
415 text
= re
.sub(r
'\s*</body>.*$', '\n', text
)