1 # Part of Objavi2, which turns html manuals into books.
2 # This deals with PDF and page specific concepts.
4 # Copyright (C) 2009 Douglas Bagnall
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation; either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License along
17 # with this program; if not, write to the Free Software Foundation, Inc.,
18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
20 """Fiddly stuff to do with pages and PDFs."""
24 from subprocess
import Popen
, PIPE
27 from objavi
import config
28 from objavi
.book_utils
import log
, run
29 from objavi
.cgi_utils
import path2url
31 def find_containing_paper(w
, h
):
32 for name
, pw
, ph
in config
.PAPER_SIZES
:
33 if pw
>= w
and ph
>= h
:
38 raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
39 (w
* config
.POINT_2_MM
, h
* config
.POINT_2_MM
))
41 class PageSettings(object):
42 """Calculates and wraps commands for the generation and processing
44 def __init__(self
, pointsize
, **kwargs
):
45 # the formulas for default gutters, margins and column margins
46 # are quite ad-hoc and certainly improvable.
47 self
.width
, self
.height
= pointsize
48 self
.papersize
, clipx
, clipy
= find_containing_paper(self
.width
, self
.height
)
49 self
.grey_scale
= 'grey_scale' in kwargs
51 self
.engine
= kwargs
.get('engine', config
.DEFAULT_ENGINE
)
52 # All measurements in points unless otherwise stated
53 # user interaction is in *mm*, but is converted in objavi2.py
54 default_margin
= (config
.BASE_MARGIN
+ config
.PROPORTIONAL_MARGIN
* min(pointsize
))
55 default_gutter
= (config
.BASE_GUTTER
+ config
.PROPORTIONAL_GUTTER
* self
.width
)
57 self
.top_margin
= kwargs
.get('top_margin', default_margin
)
58 self
.side_margin
= kwargs
.get('side_margin', default_margin
)
59 self
.bottom_margin
= kwargs
.get('bottom_margin', default_margin
)
60 self
.gutter
= kwargs
.get('gutter', default_gutter
)
62 self
.columns
= kwargs
.get('columns', 1)
63 if self
.columns
== 'auto': #default for newspapers is to work out columns
64 self
.columns
= int(self
.width
// config
.MIN_COLUMN_WIDTH
)
66 self
.column_margin
= kwargs
.get('column_margin',
67 default_margin
* 2 / (5.0 + self
.columns
))
69 self
.number_bottom
= self
.bottom_margin
- 0.6 * config
.PAGE_NUMBER_SIZE
70 self
.number_margin
= self
.side_margin
72 # calculate margins in mm for browsers
74 ## for m, clip in ((self.top_margin, clipy),
75 ## (self.side_margin, clipx + 0.5 * self.gutter),
76 ## (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
77 ## (self.side_margin, clipx + 0.5 * self.gutter),
79 ## self.margins.append((m + clip) * config.POINT_2_MM)
80 for m
, clip
in ((self
.top_margin
, 0),
81 (self
.side_margin
, 0.5 * self
.gutter
),
82 (self
.bottom_margin
, 0.5 * config
.PAGE_NUMBER_SIZE
),
83 (self
.side_margin
, 0.5 * self
.gutter
),
85 self
.margins
.append((m
+ clip
) * config
.POINT_2_MM
)
87 if 'PDFGEN' in config
.DEBUG_MODES
:
88 log("making PageSettings with:")
89 for x
in locals().iteritems():
90 log("%s: %s" % x
, debug
='PDFGEN')
92 if not x
.startswith('__'):
93 log("self.%s: %s" % (x
, getattr(self
, x
)), debug
='PDFGEN')
97 def _webkit_command(self
, html
, pdf
, outline
=False, outline_file
=None):
98 m
= [str(x
) for x
in self
.margins
]
99 outline_args
= ['--outline', '--outline-depth', '2'] * outline
100 if outline_file
is not None:
101 outline_args
+= ['--dump-outline', outline_file
]
103 greyscale_args
= ['-g'] * self
.grey_scale
105 cmd
= ([config
.WKHTMLTOPDF
] +
107 ['--page-width', str(self
.width
* config
.POINT_2_MM
),
108 '--page-height', str(self
.height
* config
.POINT_2_MM
),
109 '-T', m
[0], '-R', m
[1], '-B', m
[2], '-L', m
[3],
110 #'--disable-smart-shrinking',
116 config
.WKHTMLTOPDF_EXTRA_COMMANDS
+
122 def make_raw_pdf(self
, html
, pdf
, outline
=False, outline_file
=None):
123 html_url
= path2url(html
, full
=True)
124 func
= getattr(self
, '_%s_command' % self
.engine
)
125 if self
.columns
== 1:
126 cmd
= func(html_url
, pdf
, outline
=outline
, outline_file
=outline_file
)
129 printable_width
= self
.width
- 2.0 * self
.side_margin
- self
.gutter
130 column_width
= (printable_width
- (self
.columns
- 1) * self
.column_margin
) / self
.columns
131 page_width
= column_width
+ self
.column_margin
132 side_margin
= self
.column_margin
* 0.5
133 if 'PDFGEN' in config
.DEBUG_MODES
:
134 log("making columns with:")
135 for k
, v
in locals().iteritems():
136 log("%s: %r" % (k
, v
))
137 for k
in ('width', 'side_margin', 'gutter', 'column_margin', 'columns', 'height'):
138 log("self.%s: %r" % (k
, getattr(self
, k
)))
140 columnmaker
= PageSettings((page_width
, self
.height
),
141 gutter
=0, top_margin
=self
.top_margin
,
142 side_margin
=side_margin
,
143 bottom_margin
=self
.bottom_margin
,
144 grey_scale
=self
.grey_scale
,
148 column_pdf
= pdf
[:-4] + '-single-column.pdf'
149 columnmaker
.make_raw_pdf(html
, column_pdf
, outline
=outline
, outline_file
=outline_file
)
150 columnmaker
.reshape_pdf(column_pdf
)
152 '--nup', '%sx1' % int(self
.columns
),
153 '--paper', self
.papersize
.lower() + 'paper',
155 '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
156 '--noautoscale', 'true',
157 '--orient', 'portrait',
166 def reshape_pdf(self
, pdf
, dir='LTR', centre_start
=False, centre_end
=False,
168 """Spin the pdf for RTL text, resize it to the right size, and
169 shift the gutter left and right"""
176 ops
.append('even_pages')
183 cmd
= ['pdfedit', '-s', 'wk_objavi.qs',
186 'output_filename=%s' % pdf
,
187 'operation=%s' % ','.join(ops
),
188 'width=%s' % self
.width
,
189 'height=%s' % self
.height
,
190 'offset=%s' % gutter
,
191 'centre_start=%s' % centre_start
,
192 'centre_end=%s' % centre_end
,
196 def _number_pdf(self
, pdf
, numbers
='latin', dir='LTR',
198 cmd
= ['pdfedit', '-s', 'wk_objavi.qs',
199 'operation=page_numbers',
202 'output_filename=%s' % pdf
,
203 'number_start=%s' % number_start
,
204 'number_style=%s' % numbers
,
205 'number_bottom=%s' % self
.number_bottom
,
206 'number_margin=%s' % self
.number_margin
,
210 def number_pdf(self
, pdf
, pages
, **kwargs
):
211 # if there are too many pages for pdfedit to handle in one go,
212 # split the job into bits. <pages> may not be exact
213 if pages
is None or pages
<= config
.PDFEDIT_MAX_PAGES
:
214 self
._number
_pdf
(pdf
, **kwargs
)
216 # section_size must be even
217 sections
= pages
// config
.PDFEDIT_MAX_PAGES
+ 1
218 section_size
= (pages
// sections
+ 2) & ~
1
221 s
= kwargs
.pop('number_start', 1)
223 e
= s
+ section_size
- 1
224 pdf_section
= '%s-%s-%s.pdf' % (pdf
[:-4], s
, e
)
226 page_range
= '%s-%s' % (s
, e
)
228 page_range
= '%s-end' % s
236 self
._number
_pdf
(pdf_section
, number_start
=s
, **kwargs
)
237 pdf_sections
.append(pdf_section
)
240 concat_pdfs(pdf
, *pdf_sections
)
242 def make_barcode_pdf(self
, isbn
, pdf
, corner
='br'):
243 """Put an ISBN barcode in a corner of a single blank page."""
245 position
= '%s,%s,%s,%s,%s' % (corner
, self
.width
, self
.height
, self
.side_margin
, self
.bottom_margin
)
246 cmd1
= [config
.BOOKLAND
,
247 '--position', position
,
251 '-dDEVICEWIDTHPOINTS=%s' % self
.width
,
252 '-dDEVICEHEIGHTPOINTS=%s' % self
.height
,
255 p1
= Popen(cmd1
, stdout
=PIPE
)
256 p2
= Popen(cmd2
, stdin
=p1
.stdout
, stdout
=PIPE
, stderr
=PIPE
)
257 out
, err
= p2
.communicate()
259 log('ran:\n%s | %s' % (' '.join(cmd1
), ' '.join(cmd2
)))
260 log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1
.poll(), p2
.poll(), out
, err
))
def count_pdf_pages(pdf):
    """How many pages in the PDF?

    Runs ``pdfinfo <pdf>`` and returns, as an int, the number found on
    the "Pages:" line of its standard output.

    Raises ValueError if pdfinfo's output contains no "Pages:" line
    (for instance because pdfinfo could not read the file).  Whatever
    Popen raises when the pdfinfo program cannot be started is passed
    on unchanged.
    """
    #XXX could also use python-pypdf or python-poppler
    cmd = ('pdfinfo', pdf)
    p = Popen(cmd, stdout=PIPE, stderr=PIPE)
    out, err = p.communicate()
    # The pipe hands back a byte string, so search it with a byte
    # pattern; this behaves the same on python 2 and python 3.
    m = re.search(br'^\s*Pages:\s*(\d+)\s*$', out, re.MULTILINE)
    if m is None:
        # Without this check a failed pdfinfo run surfaces as an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError("pdfinfo found no page count for %r "
                         "(exit status %s, stderr: %r)" %
                         (pdf, p.returncode, err))
    return int(m.group(1))
273 def concat_pdfs(destination
, *pdfs
):
274 """Join all the named pdfs together into one and save it as <name>"""
276 cmd
.extend(x
for x
in pdfs
if x
is not None)
277 cmd
+= ['cat', 'output', destination
]
280 def rotate_pdf(pdfin
, pdfout
):
281 """Turn the PDF on its head"""
282 cmd
= ['pdftk', pdfin
,
290 def parse_extracted_outline(outline_file
, depth
=config
.CONTENTS_DEPTH
):
291 """Extract outline data from a file structured as follows:
293 The first line contains the text "Pages: ", followed by the total
294 number of pages in the PDF (all numbers are in ascii decimal
297 Each following line looks like this
299 "Level: " <level> " Page: " <page number> " Title: " <title> "\n"
301 where <level> is an integer indicating the significance of the
302 heading, <page number> is self-explanatory, and <title> is the
303 title encoded as utf-8 text that has been escaped as in a URI:
304 non-alphanumeric characters are replaced by "%" followed by two
305 hexadecimal digits giving their value (This escaping system is
306 variously called "url-encoding" or "percent-encoding" and is
307 described in section 2.1 of RFC 3986).
309 f
= open(outline_file
, 'r')
312 page_count
= int(re
.match('^Pages: (\d+)', page_line
).group(1))
316 m
= re
.match('Level: (\d+)\s+Page: (\d+)\s+Title: (\S*)', line
)
317 level
= int(m
.group(1))
320 pagenum
= int(m
.group(2))
321 title
= urllib
.unquote(m
.group(3)).strip()#.decode('utf-8')
323 log("WARNING: heading level %s on page %s is empty string" % (level
, pagenum
))
324 contents
.append((title
, level
, pagenum
))
326 return contents
, page_count
330 def parse_outline(pdf
, level_threshold
, debug_filename
=None):
331 """Create a structure reflecting the outline of a PDF.
332 A chapter heading looks like this:
334 BookmarkTitle: 2. What is sound?
336 BookmarkPageNumber: 3
338 cmd
= ('pdftk', pdf
, 'dump_data')
339 p
= Popen(cmd
, stdout
=PIPE
, stderr
=PIPE
)
340 outline
, err
= p
.communicate()
341 #log("OUTLINE:", outline)
342 if debug_filename
is not None:
344 f
= open(debug_filename
, 'w')
348 log("could not write to %s!" % debug_filename
)
350 lines
= (x
.strip() for x
in outline
.split('\n') if x
.strip())
354 return s
.strip(config
.WHITESPACE_AND_NULL
)
356 def extract(expected
, conv
=_strip
):
359 k
, v
= line
.split(':', 1)
363 log("trouble with line %r" %line
)
365 #There are a few useless variables, then the pagecount, then the contents.
366 #The pagecount is useful, so pick it up first.
368 while page_count
== None:
369 page_count
= extract('NumberOfPages', int)
373 title
= extract('BookmarkTitle')
374 if title
is not None:
375 level
= extract('BookmarkLevel', int)
376 pagenum
= extract('BookmarkPageNumber', int)
377 if level
<= level_threshold
and None not in (level
, pagenum
):
378 contents
.append((title
, level
, pagenum
))
379 except StopIteration:
382 return contents
, page_count