# Command-line interface.  Only -i is mandatory; a check for it follows the
# parse.  Options without a default that are not supplied are left as None
# by optparse.
parser = optparse.OptionParser(
    usage='Usage: %prog -i <source djvu file> <options> -o <output file>')
parser.add_option('-i', dest='djvu', action='store',
                  help='the source djvu file to perform OCR on')
parser.add_option('-l', dest='lang', action='store', default='eng',
                  help="OCR language (default: 'eng')")
parser.add_option('-d', dest='debug', action='store_true', default=False,
                  help='enable debugging information')
parser.add_option('-t', dest='tess_out', action='store_true', default=False,
                  help='enable tesseract output')
# The threshold is stored as a string; it is later used as the argument to
# `convert -threshold`.
parser.add_option('-b', dest='bitonal', action='store',
                  help='use imagemagick to convert the image to bitonal black and white, with a threshold given in %')
parser.add_option('-o', dest='output', action='store',
                  help='output a human readable text file to a given file path')
parser.add_option('-u', dest='update', action='store_true', default=False,
                  help='update the djvu file text layer')

# Parses sys.argv[1:]; leftover positional arguments end up in args.
(opts, args) = parser.parse_args()
30 # check mandatory options
# NOTE(review): the condition guarding this message and whatever follows it
# are not visible in this excerpt; presumably the message is printed when
# opts.djvu was not supplied -- confirm against the full file.
32 print("The input file '-i' must be given\n")
# Drives the per-page OCR pipeline for one DjVu file: ddjvu renders a page to
# TIFF, tesseract OCRs it, and djvused can write the text back as the page's
# text layer.  Several original lines (guards, loop headers, returns) are
# missing from this excerpt; the notes below flag each spot where that matters.
39 class DjvuTesseract():
# Run `command` with subprocess.Popen and wait for it to finish.  Callers in
# this class pass a list of argument strings and unpack the result as
# (out, err).
# NOTE(review): the docstring below mentions quiet=False, but the signature
# takes the flags out/err instead.
41 def command(self, command, out=False, err=False):
42 """Use subprocess.Popen" to run a command on the terminal and return the s result
44 Required for python 2.6 since subprocess.check_output doesn't exist
46 This function will trash output unless you explicitly ask it not to
47 with quiet=False. This is so tesseract won't spam you with rubbish"""
# NOTE(review): the conditions selecting PIPE for each stream, and the
# alternative values, are not visible here -- presumably keyed off out/err.
50 std_out = subprocess.PIPE
55 std_err = subprocess.PIPE
60 proc = subprocess.Popen(command, stdout = std_out, stderr=std_err)#std_out)
# communicate() blocks until the process exits; each element of the returned
# pair is None unless the matching stream was opened with subprocess.PIPE.
61 out, err = proc.communicate()
# Store the page count of the input DjVu file in self.num_pages.
65 def calculate_djvu_length(self):
# djvused's `n` command prints the number of pages in the document.
67 cmd = ['djvused', self.opts.djvu, '-e', 'n']
# The second positional argument is out=True, i.e. stdout is requested.
68 out, err = self.command(cmd, True)
# int() raises ValueError if djvused printed anything other than a number.
71 self.num_pages = int(out)
# NOTE(review): any guard around this print is not visible in this excerpt.
74 print "\t(INF) number of pages: %d\n" % self.num_pages
76 def format_ocr_text(self, page):
77 """Format a page's OCR'd text into a DJVU friendly form"""
# `page` is only used for the "## Page N ###" header written to the
# human-readable output file.
79 #read out of the text file that tesseract made
80 ocr_text = open(self.ocr_text, 'r')
82 # write into this file
83 djvu_text = open( self.djvu_text, 'w' )
# Opening of the page expression; the bounding box is the fixed "0 0 1 1".
85 text = "(page 0 0 1 1\n"
87 self.out_text.write('\n## Page %d ###\n\n' % page )
# NOTE(review): `line` is not bound in the visible lines; presumably a
# `for line in ocr_text:` header sits just above -- confirm.
91 #write to the human readable file
92 self.out_text.write(line)
94 # add each line of text
95 # escaping " to \" as we go
# NOTE(review): only the double quote is escaped; a backslash in the OCR
# output is passed through unchanged.
96 text += '(line 0 0 1 1 "%s")\n' % line.replace('"', r'\"').strip()
# NOTE(review): the closing ")" of the page expression and the closing of both
# file handles are not visible in this excerpt.
100 djvu_text.write( text )
# OCR every page in turn: extract image, optional threshold, tesseract,
# format the text, then rewrite the DjVu text layer.
105 def process_pages(self):
107 for page in range(1, self.num_pages+1): #djvu pages are 1-indexed
110 print "\tPerforming OCR on page %d" % page
112 # Extract page as an image
113 cmd = ['ddjvu', '-format=tiff', '-page=%d' % page, self.opts.djvu, self.temp_img]
114 out, err = self.command(cmd)
116 #Convert to bitonal if required
117 if self.opts.bitonal:
119 print "\tApplying bitonal conversion"
# Input and output path are the same file, so the TIFF is overwritten in place.
121 cmd = ['convert', self.temp_img, '-threshold', self.opts.bitonal, self.temp_img]
122 out, err = self.command(cmd)
124 # Perform OCR on the image
# self.temp_ocr is the output base name; tesseract appends ".txt" to it.
125 cmd = ['tesseract', self.temp_img, self.temp_ocr, '-l', self.opts.lang]
126 out, err = self.command(cmd, err=self.opts.tess_out)
129 print "\t OCR complete"
131 # convert the OCR'd text to a DJVU friendly format and a human-friendly format
132 self.format_ocr_text(page)
134 # update the DJVU text layer
# NOTE(review): the guard for this step is not visible; presumably it is
# conditional on the -u option (opts.update) -- confirm.
137 # replace the text in the DJVU file
# Two djvused runs per page: drop the existing text, then load the new text
# file; "-s" saves the document after the commands have run.
138 cmd = ['djvused', self.opts.djvu, '-e', 'select %d; remove-txt' % page, "-s"]
139 out, err = self.command(cmd)
141 cmd = ['djvused', self.opts.djvu, '-e', 'select %d; set-txt %s'% (page, self.djvu_text), "-s"]
142 out, err = self.command(cmd)
# Entry point for processing the file named by opts.djvu.
# NOTE(review): the steps after the page count are not visible in this excerpt.
144 def process_djvu(self):
147 print "(INF) Processing %s" % self.opts.djvu
149 # calculate DJVU length
150 self.calculate_djvu_length()
# Set up the scratch file paths and open the human-readable output file.
# NOTE(review): self.opts is read below but its assignment is not visible;
# presumably `self.opts = opts` precedes these lines.
155 def __init__(self, opts):
# All scratch files use fixed names under /tmp, so two runs at the same time
# would read and write the same files.
158 self.temp_img = "/tmp/TESSERACT-OCR-TEMP.tif"
159 self.temp_ocr = "/tmp/TESSERACT-OCR-TEMP" #tesseract adds .txt
161 self.ocr_text = self.temp_ocr + '.txt'
163 # file to dump page-wise formatted OCR'd text into
164 self.djvu_text = "/tmp/TESSERACT-OCR-TEMP.djvu.txt"
166 # file to dump human readable output into for the whole file
# NOTE(review): the condition choosing between the next two assignments is not
# visible; presumably the /tmp path is the fallback when -o was not given.
168 output_filename = self.opts.output
170 output_filename = "/tmp/TESSERACT-OCR-TEMP.output.txt"
# Opened once here and written to by format_ocr_text for every page; no
# matching close() is visible in this excerpt.
172 self.out_text = open(output_filename, 'w')
# NOTE(review): the body of this guard is not visible in this excerpt.
176 if __name__ == "__main__":
183 # note: text-layer structure known to work with djvused set-txt; it is the
# same page/line layout that format_ocr_text writes
184 # print TXTDJVU "(page 0 0 1 1\n" ;
185 # print TXTDJVU " (line 0 0 1 1 \"toto\")\n" ;
186 # print TXTDJVU " (line 0 0 1 1 \"toto la la\")\n";
187 # print TXTDJVU ")\n" ;