objavi/pdf.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This deals with PDF and page specific concepts.
   3 #
   4 # Copyright (C) 2009 Douglas Bagnall
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with this program; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 """Fiddly stuff to do with pages and PDFs."""
  21
  22 import os, sys
  23 import re
  24 from subprocess import Popen, PIPE
  25
  26 from objavi import config
  27 from objavi.book_utils import log, run
  28
  29
  30 def find_containing_paper(w, h):
  31     for name, pw, ph in config.PAPER_SIZES:
  32         if pw >= w and ph >= h:
  33             mw = (pw - w) * 0.5
  34             mh = (ph - h) * 0.5
  35             return (name, mw, mh)
  36
  37     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
  38                      (w * config.POINT_2_MM, h * config.POINT_2_MM))
  39
  40 class PageSettings(object):
  41     """Calculates and wraps commands for the generation and processing
  42     of PDFs"""
  43     def __init__(self, pointsize, **kwargs):
  44         # the formulas for default gutters, margins and column margins
  45         # are quite ad-hoc and certainly improvable.
  46         self.width, self.height = pointsize
  47         self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
  48         self.grey_scale = 'grey_scale' in kwargs
  49
  50         self.engine = kwargs.get('engine', config.DEFAULT_ENGINE)
  51         # All measurements in points unless otherwise stated
  52         # user interaction is in *mm*, but is converted in objavi2.py
  53         default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
  54         default_gutter = (config.BASE_GUTTER + config.PROPORTIONAL_GUTTER * self.width)
  55
  56         self.top_margin = kwargs.get('top_margin', default_margin)
  57         self.side_margin = kwargs.get('side_margin', default_margin)
  58         self.bottom_margin = kwargs.get('bottom_margin', default_margin)
  59         self.gutter = kwargs.get('gutter', default_gutter)
  60
  61         self.columns = kwargs.get('columns', 1)
  62         if self.columns == 'auto': #default for newspapers is to work out columns
  63             self.columns = int(self.width // config.MIN_COLUMN_WIDTH)
  64
  65         self.column_margin = kwargs.get('column_margin',
  66                                         default_margin * 2 / (5.0 + self.columns))
  67
  68         self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
  69         self.number_margin = self.side_margin
  70
  71         # calculate margins in mm for browsers
  72         self.margins = []
  73         for m, clip in ((self.top_margin, clipy),
  74                         (self.side_margin, clipx + 0.5 * self.gutter),
  75                         (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
  76                         (self.side_margin, clipx + 0.5 * self.gutter),
  77                         ):
  78             self.margins.append((m + clip) * config.POINT_2_MM)
  79
  80         self.moz_printer = kwargs.get('moz_printer', ('objavi_' + self.papersize))
  81
  82         if 'PDFGEN' in config.DEBUG_MODES:
  83             log("making PageSettings with:")
  84             for x in locals().iteritems():
  85                 log("%s: %s" % x, debug='PDFGEN')
  86             for x in dir(self):
  87                 if not x.startswith('__'):
  88                     log("self.%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
  89
  90
  91
  92     def _webkit_command(self, html, pdf, outline=False):
  93         m = [str(x) for x in self.margins]
  94         outline_args = ['--outline',  '--outline-depth', '2'] * outline
  95         greyscale_args = ['-g'] * self.grey_scale
  96         cmd = ([config.WKHTMLTOPDF, '-q', '-s', self.papersize,
  97                '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
  98                '-d', '100'] + outline_args + greyscale_args +
  99                config.WKHTMLTOPDF_EXTRA_COMMANDS + [html, pdf])
 100         log(' '.join(cmd))
 101         return cmd
 102
 103     def _gecko_command(self, html, pdf, outline=False):
 104         m = [str(x) for x in self.margins]
 105         #firefox -P pdfprint -print URL -printprinter "printer_settings"
 106         cmd = [config.FIREFOX, '-P', 'pdfprint', '-print',
 107                html, '-printprinter', self.moz_printer]
 108         log(' '.join(cmd))
 109         return cmd
 110
 111     def make_raw_pdf(self, html, pdf, outline=False):
 112         func = getattr(self, '_%s_command' % self.engine)
 113         if self.columns == 1:
 114             cmd = func(html, pdf, outline=outline)
 115             run(cmd)
 116         else:
 117             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 118             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 119             page_width = column_width + self.column_margin
 120             side_margin = self.column_margin * 0.5
 121             if 'PDFGEN' in config.DEBUG_MODES:
 122                 log("making columns with:")
 123                 for k, v in locals().iteritems():
 124                     log("%s: %r" % (k, v))
 125                 for k in ('width', 'side_margin', 'gutter', 'column_margin', 'columns', 'height'):
 126                     log("self.%s: %r" % (k, getattr(self, k)))
 127
 128             columnmaker = PageSettings((page_width, self.height), moz_printer=self.moz_printer,
 129                                        gutter=0, top_margin=self.top_margin,
 130                                        side_margin=side_margin,
 131                                        bottom_margin=self.bottom_margin,
 132                                        grey_scale=self.grey_scale,
 133                                        engine=self.engine
 134                                        )
 135
 136             column_pdf = pdf[:-4] + '-single-column.pdf'
 137             columnmaker.make_raw_pdf(html, column_pdf, outline=outline)
 138             columnmaker.reshape_pdf(column_pdf)
 139             cmd = ['pdfnup',
 140                    '--nup', '%sx1' % int(self.columns),
 141                    '--paper', self.papersize.lower() + 'paper',
 142                    '--outfile', pdf,
 143                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 144                    '--noautoscale', 'true',
 145                    '--orient', 'portrait',
 146                    #'--tidy', 'false',
 147                    column_pdf
 148                    ]
 149
 150             run(cmd)
 151
 152
 153
 154     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 155                     even_pages=True):
 156         """Spin the pdf for RTL text, resize it to the right size, and
 157         shift the gutter left and right"""
 158         ops = 'resize'
 159         if self.gutter:
 160             ops += ',shift'
 161         if even_pages:
 162             ops += ',even_pages'
 163         gutter = self.gutter
 164         if dir == 'RTL':
 165             gutter = -gutter
 166         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 167                'dir=%s' % dir,
 168                'filename=%s' % pdf,
 169                'output_filename=%s' % pdf,
 170                'operation=%s' % ops,
 171                'width=%s' % self.width,
 172                'height=%s' % self.height,
 173                'offset=%s' % gutter,
 174                'centre_start=%s' % centre_start,
 175                'centre_end=%s' % centre_end,
 176                ]
 177         run(cmd)
 178
 179     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 180                     number_start=1):
 181         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 182                'operation=page_numbers',
 183                'dir=%s' % dir,
 184                'filename=%s' % pdf,
 185                'output_filename=%s' % pdf,
 186                'number_start=%s' % number_start,
 187                'number_style=%s' % numbers,
 188                'number_bottom=%s' % self.number_bottom,
 189                'number_margin=%s' % self.number_margin,
 190                ]
 191         run(cmd)
 192
 193     def number_pdf(self, pdf, pages, **kwargs):
 194         # if there are too many pages for pdfedit to handle in one go,
 195         # split the job into bits.  <pages> may not be exact
 196         if pages is None or pages <= config.PDFEDIT_MAX_PAGES:
 197             self._number_pdf(pdf, **kwargs)
 198         else:
 199             # section_size must be even
 200             sections = pages // config.PDFEDIT_MAX_PAGES + 1
 201             section_size = (pages // sections + 2) & ~1
 202
 203             pdf_sections = []
 204             s = kwargs.pop('number_start', 1)
 205             while s < pages:
 206                 e = s + section_size - 1
 207                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 208                 if e < pages - 1:
 209                     page_range = '%s-%s' % (s, e)
 210                 else:
 211                     page_range = '%s-end' % s
 212                 run(['pdftk',
 213                      pdf,
 214                      'cat',
 215                      page_range,
 216                      'output',
 217                      pdf_section,
 218                      ])
 219                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 220                 pdf_sections.append(pdf_section)
 221                 s = e + 1
 222
 223             concat_pdfs(pdf, *pdf_sections)
 224
 225     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 226         """Put an ISBN barcode in a corner of a single blank page."""
 227
 228         position = '%s,%s,%s,%s,%s' % (corner, self.width, self.height, self.side_margin, self.bottom_margin)
 229         cmd1 = [config.BOOKLAND,
 230                 '--position', position,
 231                 str(isbn)]
 232         cmd2 = ['ps2pdf',
 233                 '-dFIXEDMEDIA',
 234                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 235                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 236                 '-', pdf]
 237
 238         p1 = Popen(cmd1, stdout=PIPE)
 239         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 240         out, err = p2.communicate()
 241
 242         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 243         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 244
 245
 246 def count_pdf_pages(pdf):
 247     """How many pages in the PDF?"""
 248     #XXX could also use python-pypdf or python-poppler
 249     cmd = ('pdfinfo', pdf)
 250     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 251     out, err = p.communicate()
 252     m = re.search(r'^\s*Pages:\s*(\d+)\s*$', out, re.MULTILINE)
 253     return int(m.group(1))
 254
 255
 256 def concat_pdfs(destination, *pdfs):
 257     """Join all the named pdfs together into one and save it as <name>"""
 258     cmd = ['pdftk']
 259     cmd.extend(x for x in pdfs if x is not None)
 260     cmd += ['cat', 'output', destination]
 261     run(cmd)
 262
 263 def rotate_pdf(pdfin, pdfout):
 264     """Turn the PDF on its head"""
 265     cmd = ['pdftk', pdfin,
 266            'cat',
 267            '1-endD',
 268            'output',
 269            pdfout
 270            ]
 271     run(cmd)
 272
 273 def parse_outline(pdf, level_threshold, debug_filename=None):
 274     """Create a structure reflecting the outline of a PDF.
 275     A chapter heading looks like this:
 276
 277     BookmarkTitle: 2. What is sound?
 278     BookmarkLevel: 1
 279     BookmarkPageNumber: 3
 280     """
 281     cmd = ('pdftk', pdf, 'dump_data')
 282     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 283     outline, err = p.communicate()
 284     log("OUTLINE:", outline)
 285     if debug_filename is not None:
 286         try:
 287             f = open(debug_filename, 'w')
 288             f.write(outline)
 289             f.close()
 290         except IOError:
 291             log("could not write to %s!" % debug_filename)
 292
 293     lines = (x.strip() for x in outline.split('\n') if x.strip())
 294     contents = []
 295
 296     def _strip(s):
 297         return s.strip(config.WHITESPACE_AND_NULL)
 298
 299     def extract(expected, conv=_strip):
 300         line = lines.next()
 301         try:
 302             k, v = line.split(':', 1)
 303             if k == expected:
 304                 return conv(v)
 305         except ValueError:
 306             log("trouble with line %r" %line)
 307
 308     #There are a few useless variables, then the pagecount, then the contents.
 309     #The pagecount is useful, so pick it up first.
 310     page_count = None
 311     while page_count == None:
 312         page_count = extract('NumberOfPages', int)
 313
 314     try:
 315         while True:
 316             title = extract('BookmarkTitle')
 317             if title is not None:
 318                 level = extract('BookmarkLevel', int)
 319                 pagenum = extract('BookmarkPageNumber', int)
 320                 if level <= level_threshold and None not in (level, pagenum):
 321                     contents.append((title, level, pagenum))
 322     except StopIteration:
 323         pass
 324
 325     return contents, outline, page_count