objavi/pdf.py

   1 # Part of Objavi2, which turns html manuals into books.
   2 # This deals with PDF and page specific concepts.
   3 #
   4 # Copyright (C) 2009 Douglas Bagnall
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation; either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful,
  12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 # GNU General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License along
  17 # with this program; if not, write to the Free Software Foundation, Inc.,
  18 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
  19
  20 """Fiddly stuff to do with pages and PDFs."""
  21
  22 import os, sys
  23 import re
  24 from subprocess import Popen, PIPE
  25 import urllib
  26
  27 from objavi import config
  28 from objavi.book_utils import log, run
  29 from objavi.cgi_utils import path2url
  30
  31 def find_containing_paper(w, h):
  32     for name, pw, ph in config.PAPER_SIZES:
  33         if pw >= w and ph >= h:
  34             mw = (pw - w) * 0.5
  35             mh = (ph - h) * 0.5
  36             return (name, mw, mh)
  37
  38     raise ValueError("page sized %.2fmm x %.2fmm won't fit on any paper!" %
  39                      (w * config.POINT_2_MM, h * config.POINT_2_MM))
  40
  41 class PageSettings(object):
  42     """Calculates and wraps commands for the generation and processing
  43     of PDFs"""
  44     def __init__(self, pointsize, **kwargs):
  45         # the formulas for default gutters, margins and column margins
  46         # are quite ad-hoc and certainly improvable.
  47         self.width, self.height = pointsize
  48         self.papersize, clipx, clipy = find_containing_paper(self.width, self.height)
  49         self.grey_scale = 'grey_scale' in kwargs
  50
  51         self.engine = kwargs.get('engine', config.DEFAULT_ENGINE)
  52         # All measurements in points unless otherwise stated
  53         # user interaction is in *mm*, but is converted in objavi2.py
  54         default_margin = (config.BASE_MARGIN + config.PROPORTIONAL_MARGIN * min(pointsize))
  55         default_gutter = (config.BASE_GUTTER + config.PROPORTIONAL_GUTTER * self.width)
  56
  57         self.top_margin = kwargs.get('top_margin', default_margin)
  58         self.side_margin = kwargs.get('side_margin', default_margin)
  59         self.bottom_margin = kwargs.get('bottom_margin', default_margin)
  60         self.gutter = kwargs.get('gutter', default_gutter)
  61
  62         self.columns = kwargs.get('columns', 1)
  63         if self.columns == 'auto': #default for newspapers is to work out columns
  64             self.columns = int(self.width // config.MIN_COLUMN_WIDTH)
  65
  66         self.column_margin = kwargs.get('column_margin',
  67                                         default_margin * 2 / (5.0 + self.columns))
  68
  69         self.number_bottom = self.bottom_margin - 0.6 * config.PAGE_NUMBER_SIZE
  70         self.number_margin = self.side_margin
  71
  72         # calculate margins in mm for browsers
  73         self.margins = []
  74         ## for m, clip in ((self.top_margin, clipy),
  75         ##                 (self.side_margin, clipx + 0.5 * self.gutter),
  76         ##                 (self.bottom_margin, clipy + 0.5 * config.PAGE_NUMBER_SIZE),
  77         ##                 (self.side_margin, clipx + 0.5 * self.gutter),
  78         ##                 ):
  79         ##     self.margins.append((m + clip) * config.POINT_2_MM)
  80         for m, clip in ((self.top_margin, 0),
  81                         (self.side_margin, 0.5 * self.gutter),
  82                         (self.bottom_margin, 0.5 * config.PAGE_NUMBER_SIZE),
  83                         (self.side_margin, 0.5 * self.gutter),
  84                         ):
  85             self.margins.append((m + clip) * config.POINT_2_MM)
  86
  87         if 'PDFGEN' in config.DEBUG_MODES:
  88             log("making PageSettings with:")
  89             for x in locals().iteritems():
  90                 log("%s: %s" % x, debug='PDFGEN')
  91             for x in dir(self):
  92                 if not x.startswith('__'):
  93                     log("self.%s: %s" % (x, getattr(self, x)), debug='PDFGEN')
  94
  95
  96
  97     def _webkit_command(self, html, pdf, outline=False, outline_file=None):
  98         m = [str(x) for x in self.margins]
  99         outline_args = ['--outline',  '--outline-depth', '2'] * outline
 100         if outline_file is not None:
 101             outline_args += ['--dump-outline', outline_file]
 102
 103         greyscale_args = ['-g'] * self.grey_scale
 104         quiet_args = ['-q']
 105         cmd = ([config.WKHTMLTOPDF] +
 106                quiet_args +
 107                ['--page-width', str(self.width * config.POINT_2_MM),
 108                 '--page-height', str(self.height * config.POINT_2_MM),
 109                 '-T', m[0], '-R', m[1], '-B', m[2], '-L', m[3],
 110                 #'--disable-smart-shrinking',
 111                 '-d', '100',
 112                 #'--zoom', '1.2',
 113                 ] +
 114                outline_args +
 115                greyscale_args +
 116                config.WKHTMLTOPDF_EXTRA_COMMANDS +
 117                [html, pdf])
 118         log(' '.join(cmd))
 119         return cmd
 120
 121
 122     def make_raw_pdf(self, html, pdf, outline=False, outline_file=None):
 123         html_url = path2url(html, full=True)
 124         func = getattr(self, '_%s_command' % self.engine)
 125         if self.columns == 1:
 126             cmd = func(html_url, pdf, outline=outline, outline_file=outline_file)
 127             run(cmd)
 128         else:
 129             printable_width = self.width - 2.0 * self.side_margin - self.gutter
 130             column_width = (printable_width - (self.columns - 1) * self.column_margin) / self.columns
 131             page_width = column_width + self.column_margin
 132             side_margin = self.column_margin * 0.5
 133             if 'PDFGEN' in config.DEBUG_MODES:
 134                 log("making columns with:")
 135                 for k, v in locals().iteritems():
 136                     log("%s: %r" % (k, v))
 137                 for k in ('width', 'side_margin', 'gutter', 'column_margin', 'columns', 'height'):
 138                     log("self.%s: %r" % (k, getattr(self, k)))
 139
 140             columnmaker = PageSettings((page_width, self.height),
 141                                        gutter=0, top_margin=self.top_margin,
 142                                        side_margin=side_margin,
 143                                        bottom_margin=self.bottom_margin,
 144                                        grey_scale=self.grey_scale,
 145                                        engine=self.engine
 146                                        )
 147
 148             column_pdf = pdf[:-4] + '-single-column.pdf'
 149             columnmaker.make_raw_pdf(html, column_pdf, outline=outline, outline_file=outline_file)
 150             columnmaker.reshape_pdf(column_pdf)
 151             cmd = ['pdfnup',
 152                    '--nup', '%sx1' % int(self.columns),
 153                    '--paper', self.papersize.lower() + 'paper',
 154                    '--outfile', pdf,
 155                    '--offset', '0 0', #'%scm 0' % (self.margins[1] * 0.1),
 156                    '--noautoscale', 'true',
 157                    '--orient', 'portrait',
 158                    #'--tidy', 'false',
 159                    column_pdf
 160                    ]
 161
 162             run(cmd)
 163
 164
 165
 166     def reshape_pdf(self, pdf, dir='LTR', centre_start=False, centre_end=False,
 167                     even_pages=True):
 168         """Spin the pdf for RTL text, resize it to the right size, and
 169         shift the gutter left and right"""
 170         ops = []
 171         if self.columns > 1:
 172             ops.append('resize')
 173         if self.gutter:
 174             ops.append('shift')
 175         if even_pages:
 176             ops.append('even_pages')
 177         gutter = self.gutter
 178         if dir == 'RTL':
 179             gutter = -gutter
 180         if not ops:
 181             return
 182
 183         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 184                'dir=%s' % dir,
 185                'filename=%s' % pdf,
 186                'output_filename=%s' % pdf,
 187                'operation=%s' % ','.join(ops),
 188                'width=%s' % self.width,
 189                'height=%s' % self.height,
 190                'offset=%s' % gutter,
 191                'centre_start=%s' % centre_start,
 192                'centre_end=%s' % centre_end,
 193                ]
 194         run(cmd)
 195
 196     def _number_pdf(self, pdf, numbers='latin', dir='LTR',
 197                     number_start=1):
 198         cmd = ['pdfedit', '-s', 'wk_objavi.qs',
 199                'operation=page_numbers',
 200                'dir=%s' % dir,
 201                'filename=%s' % pdf,
 202                'output_filename=%s' % pdf,
 203                'number_start=%s' % number_start,
 204                'number_style=%s' % numbers,
 205                'number_bottom=%s' % self.number_bottom,
 206                'number_margin=%s' % self.number_margin,
 207                ]
 208         run(cmd)
 209
 210     def number_pdf(self, pdf, pages, **kwargs):
 211         # if there are too many pages for pdfedit to handle in one go,
 212         # split the job into bits.  <pages> may not be exact
 213         if pages is None or pages <= config.PDFEDIT_MAX_PAGES:
 214             self._number_pdf(pdf, **kwargs)
 215         else:
 216             # section_size must be even
 217             sections = pages // config.PDFEDIT_MAX_PAGES + 1
 218             section_size = (pages // sections + 2) & ~1
 219
 220             pdf_sections = []
 221             s = kwargs.pop('number_start', 1)
 222             while s < pages:
 223                 e = s + section_size - 1
 224                 pdf_section = '%s-%s-%s.pdf' % (pdf[:-4], s, e)
 225                 if e < pages - 1:
 226                     page_range = '%s-%s' % (s, e)
 227                 else:
 228                     page_range = '%s-end' % s
 229                 run(['pdftk',
 230                      pdf,
 231                      'cat',
 232                      page_range,
 233                      'output',
 234                      pdf_section,
 235                      ])
 236                 self._number_pdf(pdf_section, number_start=s, **kwargs)
 237                 pdf_sections.append(pdf_section)
 238                 s = e + 1
 239
 240             concat_pdfs(pdf, *pdf_sections)
 241
 242     def make_barcode_pdf(self, isbn, pdf, corner='br'):
 243         """Put an ISBN barcode in a corner of a single blank page."""
 244
 245         position = '%s,%s,%s,%s,%s' % (corner, self.width, self.height, self.side_margin, self.bottom_margin)
 246         cmd1 = [config.BOOKLAND,
 247                 '--position', position,
 248                 str(isbn)]
 249         cmd2 = ['ps2pdf',
 250                 '-dFIXEDMEDIA',
 251                 '-dDEVICEWIDTHPOINTS=%s' % self.width,
 252                 '-dDEVICEHEIGHTPOINTS=%s' % self.height,
 253                 '-', pdf]
 254
 255         p1 = Popen(cmd1, stdout=PIPE)
 256         p2 = Popen(cmd2, stdin=p1.stdout, stdout=PIPE, stderr=PIPE)
 257         out, err = p2.communicate()
 258
 259         log('ran:\n%s | %s' % (' '.join(cmd1), ' '.join(cmd2)))
 260         log("return: %s and %s \nstdout:%s \nstderr:%s" % (p1.poll(), p2.poll(), out, err))
 261
 262
 263 def count_pdf_pages(pdf):
 264     """How many pages in the PDF?"""
 265     #XXX could also use python-pypdf or python-poppler
 266     cmd = ('pdfinfo', pdf)
 267     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 268     out, err = p.communicate()
 269     m = re.search(r'^\s*Pages:\s*(\d+)\s*$', out, re.MULTILINE)
 270     return int(m.group(1))
 271
 272
 273 def concat_pdfs(destination, *pdfs):
 274     """Join all the named pdfs together into one and save it as <name>"""
 275     cmd = ['pdftk']
 276     cmd.extend(x for x in pdfs if x is not None)
 277     cmd += ['cat', 'output', destination]
 278     run(cmd)
 279
 280 def rotate_pdf(pdfin, pdfout):
 281     """Turn the PDF on its head"""
 282     cmd = ['pdftk', pdfin,
 283            'cat',
 284            '1-endD',
 285            'output',
 286            pdfout
 287            ]
 288     run(cmd)
 289
 290 def parse_extracted_outline(outline_file, depth=config.CONTENTS_DEPTH):
 291     """Extract outline data from a file structured as follows:
 292
 293     The first line contains the text "Pages: ", followed by the total
 294     number of pages in the PDF (all numbers are in ascii decimal
 295     digits).
 296
 297     Each following line looks like this
 298
 299     "Level: " <level> "  Page: "  <page number> "  Title: " <title> "\n"
 300
 301     where <level> is an integer indicating the significance of the
 302     heading, <page number> is self-explanatory, and <title> is the
 303     title encoded as utf-8 text that has been escaped as in a URI:
 304     non-alphanumeric characters are replaced by "%" followed by two
 305     hexadecimal digits giving their value (This escaping system is
 306     variously called "url-encoding" or "percent-encoding" and is
 307     described in section 2.1 of RFC 3986).
 308     """
 309     f = open(outline_file, 'r')
 310     page_line = f.next()
 311     log(page_line)
 312     page_count = int(re.match('^Pages: (\d+)', page_line).group(1))
 313
 314     contents = []
 315     for line in f:
 316         m = re.match('Level: (\d+)\s+Page: (\d+)\s+Title: (\S*)', line)
 317         level = int(m.group(1))
 318         if level > depth:
 319             continue
 320         pagenum = int(m.group(2))
 321         title = urllib.unquote(m.group(3)).strip()#.decode('utf-8')
 322         if not title:
 323             log("WARNING: heading level %s on page %s is empty string" % (level, pagenum))
 324         contents.append((title, level, pagenum))
 325
 326     return contents, page_count
 327
 328
 329
 330 def parse_outline(pdf, level_threshold, debug_filename=None):
 331     """Create a structure reflecting the outline of a PDF.
 332     A chapter heading looks like this:
 333
 334     BookmarkTitle: 2. What is sound?
 335     BookmarkLevel: 1
 336     BookmarkPageNumber: 3
 337     """
 338     cmd = ('pdftk', pdf, 'dump_data')
 339     p = Popen(cmd, stdout=PIPE, stderr=PIPE)
 340     outline, err = p.communicate()
 341     #log("OUTLINE:", outline)
 342     if debug_filename is not None:
 343         try:
 344             f = open(debug_filename, 'w')
 345             f.write(outline)
 346             f.close()
 347         except IOError:
 348             log("could not write to %s!" % debug_filename)
 349
 350     lines = (x.strip() for x in outline.split('\n') if x.strip())
 351     contents = []
 352
 353     def _strip(s):
 354         return s.strip(config.WHITESPACE_AND_NULL)
 355
 356     def extract(expected, conv=_strip):
 357         line = lines.next()
 358         try:
 359             k, v = line.split(':', 1)
 360             if k == expected:
 361                 return conv(v)
 362         except ValueError:
 363             log("trouble with line %r" %line)
 364
 365     #There are a few useless variables, then the pagecount, then the contents.
 366     #The pagecount is useful, so pick it up first.
 367     page_count = None
 368     while page_count == None:
 369         page_count = extract('NumberOfPages', int)
 370
 371     try:
 372         while True:
 373             title = extract('BookmarkTitle')
 374             if title is not None:
 375                 level = extract('BookmarkLevel', int)
 376                 pagenum = extract('BookmarkPageNumber', int)
 377                 if level <= level_threshold and None not in (level, pagenum):
 378                     contents.append((title, level, pagenum))
 379     except StopIteration:
 380         pass
 381
 382     return contents, page_count