use wkhtmltopdf paper size, except for multi-column pdf
[objavi2.git] / objavi / pdf.py
blob136b5cdb85bd756df5d7273184f925211252c09e
1 # Part of Objavi2, which turns html manuals into books.
2 # This deals with PDF and page specific concepts.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Fiddly stuff to do with pages and PDFs."""
22 import os, sys
23 import re
24 from subprocess import Popen, PIPE
25 import urllib
27 from objavi import config
28 from objavi.book_utils import log, run
29 from objavi.cgi_utils import path2url
def find_containing_paper(w, h):
    """Find the first configured paper size that can hold a w x h page.

    config.PAPER_SIZES is scanned in order and the first entry whose
    width and height are both at least w and h wins.

    w, h -- page dimensions (presumably in points, since the error
            message converts them with config.POINT_2_MM -- confirm).

    Returns a tuple (paper name, half the spare width, half the spare
    height), i.e. the offsets that would centre the page on the paper.

    Raises ValueError if no configured paper is big enough.
    """
    for paper_name, paper_w, paper_h in config.PAPER_SIZES:
        if not (paper_w >= w and paper_h >= h):
            continue
        half_spare_w = (paper_w - w) * 0.5
        half_spare_h = (paper_h - h) * 0.5
        return (paper_name, half_spare_w, half_spare_h)

    size_in_mm = (w * config.POINT_2_MM, h * config.POINT_2_MM)
    raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
                     size_in_mm)
class PageSettings(object):
    """Calculates and wraps commands for the generation and processing
    of PDFs.

    An instance holds the geometry of one page format (page size,
    margins, gutter, column layout) and builds and runs the external
    command lines (config.WKHTMLTOPDF, pdfnup, pdfedit, pdftk,
    config.BOOKLAND and ps2pdf) that create and post-process PDFs of
    that format.
    """
    def __init__(self, pointsize, **kwargs):
        """Work out margins, gutter and column layout for a page.

        pointsize -- (width, height) of the page.

        Recognised keyword arguments (all optional):
          engine        -- selects the _<engine>_command method used by
                           make_raw_pdf (default config.DEFAULT_ENGINE)
          top_margin, side_margin, bottom_margin, gutter
          columns       -- number of columns, or 'auto' to derive it
                           from config.MIN_COLUMN_WIDTH (default 1)
          column_margin -- space between columns
          grey_scale    -- NOTE: only the *presence* of this key is
                           tested; passing grey_scale=False still
                           switches grey scale on.
        """
        # the formulas for default gutters, margins and column margins
        # are quite ad-hoc and certainly improvable.
        self.width, self.height = pointsize
        # clipx and clipy are not used any more (see the margin
        # calculation below) but are kept so they still show up in the
        # PDFGEN debug dump of locals().
        self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
        self.grey_scale = 'grey_scale' in kwargs

        self.engine = kwargs.get('engine', config.DEFAULT_ENGINE)
        # All measurements in points unless otherwise stated
        # user interaction is in *mm*, but is converted in objavi2.py
        default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
        default_gutter = (config.BASE_GUTTER + config.PROPORTIONAL_GUTTER * self.width)

        self.top_margin = kwargs.get('top_margin', default_margin)
        self.side_margin = kwargs.get('side_margin', default_margin)
        self.bottom_margin = kwargs.get('bottom_margin', default_margin)
        self.gutter = kwargs.get('gutter', default_gutter)

        self.columns = kwargs.get('columns', 1)
        if self.columns == 'auto': #default for newspapers is to work out columns
            self.columns = int(self.width // config.MIN_COLUMN_WIDTH)

        self.column_margin = kwargs.get('column_margin',
                                        default_margin * 2 / (5.0 + self.columns))

        self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
        self.number_margin = self.side_margin

        # calculate margins in mm for browsers, in the order top,
        # right, bottom, left (the order _webkit_command consumes them).
        # Half the gutter goes on each side and half the page number
        # size is added at the bottom.  An earlier version also added
        # clipx/clipy here; that is no longer done because the renderer
        # is given the exact page width and height.
        self.margins = []
        for m, clip in ((self.top_margin, 0),
                        (self.side_margin, 0.5 * self.gutter),
                        (self.bottom_margin, 0.5 * config.PAGE_NUMBER_SIZE),
                        (self.side_margin, 0.5 * self.gutter),
                        ):
            self.margins.append((m + clip) * config.POINT_2_MM)

        if 'PDFGEN' in config.DEBUG_MODES:
            log("making PageSettings with:")
            for x in locals().items():
                log("%s: %s" % x, debug='PDFGEN')
            for x in dir(self):
                if not x.startswith('__'):
                    log("self.%s: %s" % (x, getattr(self, x)), debug='PDFGEN')

    def _webkit_command(self, html, pdf, outline=False, outline_file=None):
        """Build (and log) the config.WKHTMLTOPDF command line.

        html         -- source document, passed through as given
        pdf          -- output file name
        outline      -- if true, add the '--outline' options
        outline_file -- if not None, add '--dump-outline' with this file

        Returns the command as a list of strings; nothing is executed.
        """
        m = [str(x) for x in self.margins]
        # list * bool: the options appear once when the flag is true
        # and not at all when it is false.
        outline_args = ['--outline', '--outline-depth', '2'] * outline
        if outline_file is not None:
            outline_args += ['--dump-outline', outline_file]

        greyscale_args = ['-g'] * self.grey_scale
        quiet_args = ['-q']
        cmd = ([config.WKHTMLTOPDF] +
               quiet_args +
               ['--page-width', str(self.width * config.POINT_2_MM),
                '--page-height', str(self.height * config.POINT_2_MM),
                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
                #'--disable-smart-shrinking',
                '-d', '100',
                #'--zoom', '1.2',
                ] +
               outline_args +
               greyscale_args +
               config.WKHTMLTOPDF_EXTRA_COMMANDS +
               [html, pdf])
        log(' '.join(cmd))
        return cmd

    def make_raw_pdf(self, html, pdf, outline=False, outline_file=None):
        """Render html to pdf using the configured engine.

        With a single column the engine command is run directly.  With
        several columns, a narrow single-column PDF (one column per
        page) is rendered by a temporary PageSettings and then tiled
        side by side onto self.papersize with pdfnup.
        """
        html_url = path2url(html, full=True)
        # Looked up before branching so an unknown engine fails early.
        func = getattr(self, '_%s_command' % self.engine)
        if self.columns == 1:
            cmd = func(html_url, pdf, outline=outline, outline_file=outline_file)
            run(cmd)
        else:
            printable_width = self.width - 2.0 * self.side_margin - self.gutter
            column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
            page_width = column_width + self.column_margin
            side_margin = self.column_margin * 0.5
            if 'PDFGEN' in config.DEBUG_MODES:
                log("making columns with:")
                for k, v in locals().items():
                    log("%s: %r" % (k, v))
                for k in ('width', 'side_margin', 'gutter', 'column_margin', 'columns', 'height'):
                    log("self.%s: %r" % (k, getattr(self, k)))

            column_kwargs = dict(gutter=0, top_margin=self.top_margin,
                                 side_margin=side_margin,
                                 bottom_margin=self.bottom_margin,
                                 engine=self.engine)
            # __init__ only tests for the presence of 'grey_scale', so
            # the key must be left out entirely when grey scale is off;
            # passing grey_scale=False would force the columns grey.
            if self.grey_scale:
                column_kwargs['grey_scale'] = True
            columnmaker = PageSettings((page_width, self.height),
                                       **column_kwargs)

            column_pdf = pdf[:-4] + '-single-column.pdf'
            columnmaker.make_raw_pdf(html, column_pdf, outline=outline, outline_file=outline_file)
            columnmaker.reshape_pdf(column_pdf)
            cmd = ['pdfnup',
                   '--nup', '%sx1' % int(self.columns),
                   '--paper', self.papersize.lower() + 'paper',
                   '--outfile', pdf,
                   '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
                   '--noautoscale', 'true',
                   '--orient', 'portrait',
                   #'--tidy', 'false',
                   column_pdf
                   ]
            run(cmd)

    def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
                    even_pages=True):
        """Run the wk_objavi.qs pdfedit script over pdf, in place.

        The requested operations are 'resize' (when there is more than
        one column), 'shift' (when the gutter is non-zero) and
        'even_pages' (when even_pages is true).  For dir == 'RTL' the
        gutter offset is negated.  Nothing is run if no operation
        applies.
        """
        ops = []
        if self.columns > 1:
            ops.append('resize')
        if self.gutter:
            ops.append('shift')
        if even_pages:
            ops.append('even_pages')
        gutter = self.gutter
        if dir == 'RTL':
            gutter = -gutter
        if not ops:
            return

        cmd = ['pdfedit', '-s', 'wk_objavi.qs',
               'dir=%s' % dir,
               'filename=%s' % pdf,
               'output_filename=%s' % pdf,
               'operation=%s' % ','.join(ops),
               'width=%s' % self.width,
               'height=%s' % self.height,
               'offset=%s' % gutter,
               'centre_start=%s' % centre_start,
               'centre_end=%s' % centre_end,
               ]
        run(cmd)

    def _number_pdf(self, pdf, numbers='latin', dir='LTR',
                    number_start=1):
        """Run the 'page_numbers' operation of wk_objavi.qs over pdf,
        in place, in a single pdfedit invocation."""
        cmd = ['pdfedit', '-s', 'wk_objavi.qs',
               'operation=page_numbers',
               'dir=%s' % dir,
               'filename=%s' % pdf,
               'output_filename=%s' % pdf,
               'number_start=%s' % number_start,
               'number_style=%s' % numbers,
               'number_bottom=%s' % self.number_bottom,
               'number_margin=%s' % self.number_margin,
               ]
        run(cmd)

    def number_pdf(self, pdf, pages, **kwargs):
        """Add page numbers to pdf, in place.

        pages  -- approximate page count, or None if unknown
        kwargs -- passed on to _number_pdf (numbers, dir, number_start)

        If pages exceeds config.PDFEDIT_MAX_PAGES the PDF is split with
        pdftk into sections, each section is numbered separately, and
        the sections are joined back into pdf.  The section files are
        left on disk.
        """
        # if there are too many pages for pdfedit to handle in one go,
        # split the job into bits.  <pages> may not be exact
        if pages is None or pages <= config.PDFEDIT_MAX_PAGES:
            self._number_pdf(pdf, **kwargs)
            return

        # section_size must be even
        sections = pages // config.PDFEDIT_MAX_PAGES + 1
        section_size = (pages // sections + 2) & ~1

        # number_start is the number printed on the first page; it is
        # independent of the 1-based page index <s> used to cut the
        # file, so the two are tracked separately.
        number_start = kwargs.pop('number_start', 1)
        pdf_sections = []
        s = 1
        while s < pages:
            e = s + section_size - 1
            pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
            # <pages> may be inexact, so the final section is cut with
            # an open-ended range rather than trusting <e>.
            if e < pages - 1:
                page_range = '%s-%s' % (s, e)
            else:
                page_range = '%s-end' % s
            run(['pdftk',
                 pdf,
                 'cat',
                 page_range,
                 'output',
                 pdf_section,
                 ])
            self._number_pdf(pdf_section,
                             number_start=number_start + s - 1,
                             **kwargs)
            pdf_sections.append(pdf_section)
            s = e + 1

        concat_pdfs(pdf, *pdf_sections)

    def make_barcode_pdf(self, isbn, pdf, corner='br'):
        """Put an ISBN barcode in a corner of a single blank page.

        The output of config.BOOKLAND is piped into ps2pdf, which
        writes a page of this object's width and height to pdf.  The
        commands, their exit statuses and ps2pdf's output are logged;
        failures are not raised.
        """
        position = '%s,%s,%s,%s,%s' % (corner, self.width, self.height, self.side_margin, self.bottom_margin)
        cmd1 = [config.BOOKLAND,
                '--position', position,
                str(isbn)]
        cmd2 = ['ps2pdf',
                '-dFIXEDMEDIA',
                '-dDEVICEWIDTHPOINTS=%s' % self.width,
                '-dDEVICEHEIGHTPOINTS=%s' % self.height,
                '-', pdf]

        p1 = Popen(cmd1, stdout=PIPE)
        p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
        # Drop our copy of the pipe so that p1 gets SIGPIPE if p2 exits
        # early, instead of blocking on a pipe nobody reads.
        p1.stdout.close()
        out, err = p2.communicate()
        # Reap p1 so its exit status is known when it is logged.
        p1.wait()

        log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
        log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.returncode, p2.returncode, out, err))
def count_pdf_pages(pdf):
    """How many pages in the PDF?

    Runs `pdfinfo <pdf>` and returns the number from its "Pages:" line
    as an int.

    Raises ValueError if the output contains no "Pages:" line (for
    example because pdfinfo could not read the file); the message
    includes pdfinfo's exit status and stderr.
    """
    #XXX could also use python-pypdf or python-poppler
    cmd = ('pdfinfo', pdf)
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    m = re.search(r'^\s*Pages:\s*(\d+)\s*$', out, re.MULTILINE)
    if m is None:
        raise ValueError("pdfinfo gave no page count for %r (exit status %s): %s"
                         % (pdf, p.returncode, err))
    return int(m.group(1))
def concat_pdfs(destination, *pdfs):
    """Join the named pdfs, in order, and save the result as
    <destination>.  Any None among <pdfs> is skipped."""
    sources = [name for name in pdfs if name is not None]
    run(['pdftk'] + sources + ['cat', 'output', destination])
def rotate_pdf(pdfin, pdfout):
    """Turn the PDF on its head.

    Runs pdftk over every page of <pdfin> with the page range
    '1-endD' and writes the result to <pdfout>.
    """
    pdftk_args = ('cat', '1-endD', 'output', pdfout)
    run(['pdftk', pdfin] + list(pdftk_args))
def parse_extracted_outline(outline_file, depth=config.CONTENTS_DEPTH):
    """Extract outline data from a file structured as follows:

    The first line contains the text "Pages: ", followed by the total
    number of pages in the PDF (all numbers are in ascii decimal
    digits).

    Each following line looks like this

       "Level: " <level> " Page: " <page number> " Title: " <title>

    (terminated by a newline) where <level> is an integer indicating
    the significance of the heading, <page number> is
    self-explanatory, and <title> is the title encoded as utf-8 text
    that has been escaped as in a URI: non-alphanumeric characters are
    replaced by "%" followed by two hexadecimal digits giving their
    value (This escaping system is variously called "url-encoding" or
    "percent-encoding" and is described in section 2.1 of RFC 3986).

    Headings with a level greater than <depth> are left out.  Lines
    that do not match the format are logged and skipped.

    Returns (contents, page_count), where contents is a list of
    (title, level, page number) tuples in file order.  The titles are
    unescaped but not decoded from utf-8.
    """
    f = open(outline_file, 'r')
    try:
        page_line = f.next()
        log(page_line)
        page_count = int(re.match(r'^Pages: (\d+)', page_line).group(1))

        contents = []
        for line in f:
            m = re.match(r'Level: (\d+)\s+Page: (\d+)\s+Title: (\S*)', line)
            if m is None:
                # e.g. a blank trailing line; don't let it abort the parse
                log("WARNING: ignoring unparseable outline line %r" % line)
                continue
            level = int(m.group(1))
            if level > depth:
                continue
            pagenum = int(m.group(2))
            title = urllib.unquote(m.group(3)).strip()#.decode('utf-8')
            if not title:
                log("WARNING: heading level %s on page %s is empty string" % (level, pagenum))
            contents.append((title, level, pagenum))
    finally:
        # close the file even if the header or a number fails to parse
        f.close()

    return contents, page_count
def parse_outline(pdf, level_threshold, debug_filename=None):
    """Create a structure reflecting the outline of a PDF.

    The outline is read from the output of `pdftk <pdf> dump_data`,
    in which a chapter heading looks like this:

    BookmarkTitle: 2. What is sound?
    BookmarkLevel: 1
    BookmarkPageNumber: 3

    pdf             -- the PDF file to inspect
    level_threshold -- bookmarks with a level above this are left out
    debug_filename  -- if not None, the raw pdftk output is also saved
                       there (failure to write is logged, not raised)

    Returns (contents, page_count), where contents is a list of
    (title, level, page number) tuples in document order.

    Raises ValueError if the pdftk output has no NumberOfPages line.
    """
    cmd = ('pdftk', pdf, 'dump_data')
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    outline, err = p.communicate()
    #log("OUTLINE:", outline)
    if debug_filename is not None:
        try:
            f = open(debug_filename, 'w')
            try:
                f.write(outline)
            finally:
                # don't leak the handle if the write itself fails
                f.close()
        except IOError:
            log("could not write to %s!" % debug_filename)

    lines = (x.strip() for x in outline.split('\n') if x.strip())
    contents = []

    def _strip(s):
        return s.strip(config.WHITESPACE_AND_NULL)

    def extract(expected, conv=_strip):
        # Consume one line.  Return its converted value if its key is
        # <expected>; otherwise (wrong key, no colon, or a conversion
        # that raises ValueError) return None.  StopIteration escapes
        # when the lines run out.
        line = lines.next()
        try:
            k, v = line.split(':', 1)
            if k == expected:
                return conv(v)
        except ValueError:
            log("trouble with line %r" %line)

    #There are a few useless variables, then the pagecount, then the contents.
    #The pagecount is useful, so pick it up first.
    page_count = None
    try:
        while page_count is None:
            page_count = extract('NumberOfPages', int)
    except StopIteration:
        # Report the real problem rather than leaking StopIteration
        # into the caller.
        raise ValueError("no NumberOfPages found in pdftk output for %r: %s"
                         % (pdf, err))

    try:
        while True:
            title = extract('BookmarkTitle')
            if title is not None:
                level = extract('BookmarkLevel', int)
                pagenum = extract('BookmarkPageNumber', int)
                # check for missing values before comparing them
                if None not in (level, pagenum) and level <= level_threshold:
                    contents.append((title, level, pagenum))
    except StopIteration:
        # ran out of lines: the whole dump has been consumed
        pass

    return contents, page_count