djvuocr/djvuocr

   1 #!/usr/bin/env python2
   2
   3 import os
   4 import glob
   5 import subprocess
   6 import re
   7 import optparse
   8
   9
  10 def main():
  11
  12     parser = optparse.OptionParser(usage='Usage: %prog -i <source directory> <options> -o <output file>')
  13     parser.add_option('-i', dest='djvu', action='store',\
  14                              help='the source djvu file to perfrom OCR on')
  15     parser.add_option('-l', dest='lang', action='store', default='eng',\
  16                              help="OCR language (default: 'eng')" )
  17     parser.add_option('-d', dest='debug', action='store_true', default=False,\
  18                              help='enable debugging information' )
  19     parser.add_option('-t', dest='tess_out', action='store_true', default=False,\
  20                              help='enable tesseract output' )
  21     parser.add_option('-b', dest='bitonal', action='store',\
  22                              help='use imagemagick to convert the image to bitonal black and white, with a threshold given in %' )
  23     parser.add_option('-o', dest='output', action='store',\
  24                              help='output a human readable text file to a given file path' )
  25     parser.add_option('-u', dest='update', action='store_true', default=False,\
  26                              help='update the djvu file text layer' )
  27
  28     (opts, args) = parser.parse_args()
  29
  30     # check mandatory options
  31     if opts.djvu is None:
  32         print("The input file '-i' must be given\n")
  33         parser.print_help()
  34         exit(-1)
  35
  36     DjvuTesseract(opts)
  37
  38
  39 class DjvuTesseract():
  40
  41     def command(self, command, out=False, err=False):
  42         """Use subprocess.Popen" to run a command on the terminal and return the s result
  43
  44         Required for python 2.6 since subprocess.check_output doesn't exist
  45
  46         This function will trash output unless you explicitly ask it not to
  47         with quiet=False. This is so tesseract won't spam you with rubbish"""
  48
  49         if out:
  50             std_out = subprocess.PIPE
  51         else:
  52             std_out = None
  53
  54         if not err:
  55             std_err = subprocess.PIPE
  56         else:
  57             std_err = None
  58
  59
  60         proc = subprocess.Popen(command, stdout = std_out,  stderr=std_err)#std_out)
  61         out, err = proc.communicate()
  62
  63         return out, err
  64
  65     def calculate_djvu_length(self):
  66
  67         cmd = ['djvused', self.opts.djvu, '-e', 'n']
  68         out, err = self.command(cmd, True)
  69
  70
  71         self.num_pages = int(out)
  72
  73         if self.opts.debug:
  74             print "\t(INF) number of pages: %d\n" % self.num_pages
  75
  76     def format_ocr_text(self, page):
  77         """Format a page's OCR'd text into a DJVU friendly form"""
  78
  79         #read out of the text file that tesseract made
  80         ocr_text = open(self.ocr_text, 'r')
  81
  82         # write into this file
  83         djvu_text = open( self.djvu_text, 'w' )
  84
  85         text = "(page 0 0 1 1\n"
  86
  87         self.out_text.write('\n## Page %d ###\n\n' % page )
  88
  89         for line in ocr_text:
  90
  91             #write to the human readable file
  92             self.out_text.write(line)
  93
  94             # add each line of text
  95             # escaping " to \" as we go
  96             text += '(line 0 0 1 1 "%s")\n' % line.replace('"', r'\"').strip()
  97
  98         text += ")\n"
  99
 100         djvu_text.write( text )
 101
 102         ocr_text.close()
 103         djvu_text.close()
 104
 105     def process_pages(self):
 106
 107         for page in range(1, self.num_pages+1): #djvu pages are 1-indexed
 108
 109             if self.opts.debug:
 110                 print "\tPerforming OCR on page %d" % page
 111
 112             # Extract page an image
 113             cmd = ['ddjvu', '-format=tiff', '-page=%d' % page, self.opts.djvu, self.temp_img]
 114             out, err = self.command(cmd)
 115
 116             #Convert to bitonal if required
 117             if self.opts.bitonal:
 118                 if self.opts.debug:
 119                     print "\tApplying bitonal conversion"
 120
 121                 cmd = ['convert', self.temp_img, '-threshold', self.opts.bitonal, self.temp_img]
 122                 out, err = self.command(cmd)
 123
 124             # Perform OCR on the image
 125             cmd = ['tesseract', self.temp_img, self.temp_ocr, '-l', self.opts.lang]
 126             out, err = self.command(cmd, err=self.opts.tess_out)
 127
 128             if self.opts.debug:
 129                 print "\t OCR complete"
 130
 131             # convert the OCR'd text to a DJVU friendly fomat and a human-friendly format
 132             self.format_ocr_text(page)
 133
 134             # update the DJVU text layer
 135             if self.opts.update:
 136
 137                 # replace the text in the DJVU file
 138                 cmd = ['djvused', self.opts.djvu, '-e', 'select %d; remove-txt' % page, "-s"]
 139                 out, err = self.command(cmd)
 140
 141                 cmd = ['djvused', self.opts.djvu, '-e', 'select %d; set-txt %s'% (page, self.djvu_text), "-s"]
 142                 out, err = self.command(cmd)
 143
 144     def process_djvu(self):
 145
 146         if self.opts.debug:
 147             print "(INF) Processing %s" % self.opts.djvu
 148
 149         # calculate DJVU length
 150         self.calculate_djvu_length()
 151
 152         self.process_pages()
 153
 154
 155     def __init__(self, opts):
 156         self.opts = opts
 157
 158         self.temp_img = "/tmp/TESSERACT-OCR-TEMP.tif"
 159         self.temp_ocr = "/tmp/TESSERACT-OCR-TEMP" #tesseract adds .txt
 160
 161         self.ocr_text = self.temp_ocr + '.txt'
 162
 163         # file to dump pase-wise formatted OCR'd text into
 164         self.djvu_text = "/tmp/TESSERACT-OCR-TEMP.djvu.txt"
 165
 166         # file to dump human readable output into for the whole file
 167         if self.opts.output:
 168             output_filename = self.opts.output
 169         else: #dump in /tmp/
 170             output_filename = "/tmp/TESSERACT-OCR-TEMP.output.txt"
 171
 172         self.out_text = open(output_filename, 'w')
 173
 174         self.process_djvu()
 175
 176 if __name__ == "__main__":
 177     try:
 178         main()
 179     finally:
 180         None
 181
 182 """
 183 # note: structure which works
 184 # print TXTDJVU "(page 0 0 1 1\n" ;
 185 #   print TXTDJVU "     (line 0 0 1 1 \"toto\")\n" ;
 186 #   print TXTDJVU "     (line 0 0 1 1 \"toto la la\")\n";
 187 #   print TXTDJVU ")\n" ;
 188 """