#!/usr/bin/env python
import os
import glob
import subprocess
import re
import optparse
def main():
parser = optparse.OptionParser(usage='Usage: %prog -i <source directory> <options> -o <output file>')
parser.add_option('-i', dest='djvu', action='store',\
help='the source djvu file to perfrom OCR on')
parser.add_option('-l', dest='lang', action='store', default='eng',\
help="OCR language (default: 'eng')" )
parser.add_option('-d', dest='debug', action='store_true', default=False,\
help='enable debugging information' )
parser.add_option('-t', dest='tess_out', action='store_true', default=False,\
help='enable tesseract output' )
parser.add_option('-c', dest='clean', action='store_true', default=False,\
help='use textcleaner and imagemagick to convert the image to bitonal black and white' )
parser.add_option('-o', dest='output', action='store',\
help='output a human readable text file to a given file path' )
parser.add_option('-u', dest='update', action='store_true', default=False,\
help='update the djvu file text layer' )
(opts, args) = parser.parse_args()
# check mandatory options
if opts.djvu is None:
print("The input file '-i' must be given\n")
parser.print_help()
exit(-1)
DjvuTesseract(opts)
class DjvuTesseract():
def command(self, command, out=False, err=False):
"""Use subprocess.Popen" to run a command on the terminal and return the s result
Required for python 2.6 since subprocess.check_output doesn't exist
This function will trash output unless you explicitly ask it not to
with quiet=False. This is so tesseract won't spam you with rubbish"""
if out:
std_out = subprocess.PIPE
else:
std_out = None
if not err:
std_err = subprocess.PIPE
else:
std_err = None
proc = subprocess.Popen(command, stdout = std_out, stderr=std_err)#std_out)
out, err = proc.communicate()
return out, err
def complete(self):
"""Prints a "complete" message if debugging is on"""
if self.opts.debug:
print("complete")
def calculate_djvu_length(self):
cmd = ['djvused', self.opts.djvu, '-e', 'n']
out, err = self.command(cmd, out=True)
self.num_pages = int(out)
if self.opts.debug:
print("(INF) number of pages: %d" % self.num_pages)
def format_ocr_text(self, page):
"""Format a page's OCR'd text into a DJVU friendly form"""
#read out of the text file that tesseract made
ocr_text = open(self.ocr_text, 'r', encoding='utf-8')
# write into this file
djvu_text = open( self.djvu_text, 'w', encoding='utf-8')
text = "(page 0 0 1 1\n"
self.out_text.write('\n## Page %d ###\n\n' % page )
for line in ocr_text:
#write to the human readable file
self.out_text.write(line)
# add each line of text
# escaping " to \" as we go
text += '(line 0 0 1 1 "%s")\n' % line.replace('\\', '\\\\').replace('"', '\\"').strip()
text += ")\n"
djvu_text.write( text )
ocr_text.close()
djvu_text.close()
def process_pages(self):
for page in range(1, self.num_pages+1): #djvu pages are 1-indexed
if self.opts.debug:
print("\n\t(INF) Processing page %d" % page)
if self.opts.debug:
print("\t(INF) Extracting DjVu page to image . . .")
# Extract page an image
cmd = ['ddjvu', '-format=tiff', '-page=%d' % page, self.opts.djvu, self.temp_img]
out, err = self.command(cmd)
self.complete()
#Cleanup image
if self.opts.clean:
if self.opts.debug:
print("\t(INF) Applying textcleaner . . .")
# apply text cleaner
cmd = ['./textcleaner', self.temp_img, self.temp_img]
out, err = self.command(cmd)
self.complete()
if self.opts.debug:
print("\t(INF) Applying bitonal conversion . . .")
# apply text cleaner
cmd = ['convert', self.temp_img, '-threshold', '50%', self.temp_img]
out, err = self.command(cmd)
self.complete()
if self.opts.debug:
print("\t(INF) Beginning OCR. . .")
# Perform OCR on the image
cmd = ['tesseract', self.temp_img, self.temp_ocr, '-l', self.opts.lang]
out, err = self.command(cmd, err=self.opts.tess_out)
self.complete()
# convert the OCR'd text to a DJVU friendly fomat and a human-friendly format
self.format_ocr_text(page)
# update the DJVU text layer
if self.opts.update:
if self.opts.debug:
print("\t(INF) Updating DJVU page . . .")
# replace the text in the DJVU file
cmd = ['djvused', self.opts.djvu, '-e', 'select %d; remove-txt' % page, "-s"]
out, err = self.command(cmd)
cmd = ['djvused', self.opts.djvu, '-e', 'select %d; set-txt %s'% (page, self.djvu_text), "-s"]
out, err = self.command(cmd)
self.complete()
def process_djvu(self):
if self.opts.debug:
print("(INF) Processing %s" % self.opts.djvu)
# calculate DJVU length
self.calculate_djvu_length()
self.process_pages()
def __init__(self, opts):
self.opts = opts
self.temp_img = "TESSERACT-OCR-TEMP.tif"
self.temp_ocr = "TESSERACT-OCR-TEMP" #tesseract adds .txt
self.ocr_text = self.temp_ocr + '.txt'
# file to dump pase-wise formatted OCR'd text into
self.djvu_text = "TESSERACT-OCR-TEMP.djvu.txt"
# file to dump human readable output into for the whole file
if self.opts.output:
output_filename = self.opts.output
else: #dump in /tmp/
output_filename = "TESSERACT-OCR-TEMP.output.txt"
self.out_text = open(output_filename, 'w', encoding='utf-8')
self.process_djvu()
if __name__ == "__main__":
try:
main()
finally:
None
"""
# note: structure which works
# print TXTDJVU "(page 0 0 1 1\n" ;
# print TXTDJVU " (line 0 0 1 1 \"toto\")\n" ;
# print TXTDJVU " (line 0 0 1 1 \"toto la la\")\n";
# print TXTDJVU ")\n" ;
"""