# -*- coding: utf-8 -*-
# text alignment program
# alcune idee tratte da Match and Split
import os, re, sys
# Escape sequences handled when un-escaping DjVu text, mapped to the plain
# text that replaces them.  The octal codes are the ASCII unit separator
# (037), group separator (035) and vertical tab (013).
_DJVU_ESCAPE_MAP = {
    u'\\n': u'\n',
    u'\\t': u' ',
    u'\\"': u'"',
    u'\\\\': u'\\',
    u'\\037': u'\n',
    u'\\035': u'',
    u'\\013': u'',
}
# One alternation covering every key of _DJVU_ESCAPE_MAP.
_DJVU_ESCAPE_RE = re.compile(u'\\\\(?:037|035|013|[nt"\\\\])')


def unquote_text_from_djvu(text):
    """Remove the escaping of special characters from DjVu text.

    The text is scanned once, left to right, so an escaped backslash is
    consumed as a unit and can never be combined with the characters that
    follow it (``\\\\n`` decodes to a backslash followed by ``n``, not to a
    newline).  Escape sequences that are not in the table, such as ``\\r``,
    are left untouched.

    Args:
        text: the escaped page text (unicode string).

    Returns:
        The unescaped text with trailing newlines stripped.
    """
    unescaped = _DJVU_ESCAPE_RE.sub(
        lambda match: _DJVU_ESCAPE_MAP[match.group(0)], text)
    return unescaped.rstrip(u'\n')
def extract_djvu_text(filename):
    """Extract the text layer of a DjVu file as a list of page texts.

    Runs the external ``djvutxt --detail=page`` command, which writes its
    output to ``text.txt`` in the current directory, then parses that file.
    The DjVu file must already be available locally.

    Side effects: sets ``os.environ['LANG']``, creates/overwrites
    ``text.txt`` and prints progress information.

    Args:
        filename: path of the DjVu file (byte string or unicode).

    Returns:
        A list of unicode strings, one per matched page entry; the text of
        page 1 is at index 0.
    """
    # Local import: the module header does not import subprocess.
    import subprocess

    print("extracting text layer")
    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')
    os.environ['LANG'] = 'en_US.UTF8'
    # NOTE(review): the latin-1 re-encoding is kept from the original code;
    # it raises UnicodeEncodeError for names outside latin-1 -- confirm it
    # is still wanted on the target platform.
    encoded_name = unicode(filename, "utf-8").encode("latin-1")
    comando = "djvutxt --detail=page %s text.txt" % (encoded_name)
    # An argument list (no shell) keeps filenames containing spaces or
    # shell metacharacters intact.
    result = subprocess.call(
        ["djvutxt", "--detail=page", encoded_name, "text.txt"])
    print("%s %s" % (comando, result))
    with open("text.txt") as handle:
        text = handle.read()

    # Each entry is either "(page x y w h "escaped text")" or an empty "()",
    # terminated by a newline.  Group 2 holds the escaped text and is None
    # for the empty form.
    page_re = re.compile(
        r'\((page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"(.*)"[ ]*|)\)\n')
    data = []
    for match in page_re.finditer(text):
        raw = match.group(2) or ''
        page_text = unicode(raw, 'utf-8', 'replace')
        data.append(unquote_text_from_djvu(page_text))
    return data
def extract_pdf_text(filename):
    """Extract the text layer of a PDF file as a list of page texts.

    Runs the external ``pdftotext -enc UTF-8`` command, which writes its
    output to ``text.txt`` in the current directory, then splits that text
    on form-feed characters.

    Side effects: sets ``os.environ['LANG']``, creates/overwrites
    ``text.txt`` and prints progress information.

    Args:
        filename: path of the PDF file (byte string or unicode).

    Returns:
        A list of unicode strings obtained by splitting the extracted text
        on form feeds.
    """
    # Local import: the module header does not import subprocess.
    import subprocess

    print("extracting text layer from pdf")
    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')
    os.environ['LANG'] = 'en_US.UTF8'
    comando = "pdftotext -enc UTF-8 %s text.txt" % (filename)
    # An argument list (no shell) keeps filenames containing spaces or
    # shell metacharacters intact.
    result = subprocess.call(
        ["pdftotext", "-enc", "UTF-8", filename, "text.txt"])
    print("%s %s" % (comando, result))
    with open("text.txt") as handle:
        text = unicode(handle.read(), "utf-8")
    # Drop every "¬ " sequence from the extracted text.
    text = text.replace(u"¬ ", u"")
    # NOTE(review): if pdftotext also ends the last page with a form feed,
    # the final list element is an empty string -- confirm callers want it.
    return text.split(u"\x0c")
def main(params):
    """Extract the text layer of a PDF/DjVu file and save it with page headers.

    Each page text is prefixed with a ``==[[Pagina:<index name>/<n>]]==``
    header line (pages numbered from 1); the pages are joined with newlines
    and written, UTF-8 encoded, next to the input file as ``<name>_pdf.txt``
    or ``<name>_djvu.txt``.

    Args:
        params: argument list in ``sys.argv`` layout: ``params[1]`` is the
            input file name, ``params[2]`` the optional index base name.
            When ``params[2]`` is missing or ``None`` a default name is
            used.  The list is not modified.

    Returns:
        The tuple ``("Salvato ", txtname)`` on success, or ``None`` when the
        file is neither ``.pdf`` nor ``.djvu`` (a message is printed).
    """
    # A missing file name falls through to the "unsupported type" branch
    # instead of raising IndexError.
    filename = params[1] if len(params) > 1 else ""
    index_name = params[2] if len(params) > 2 else None

    if filename.endswith(".pdf"):
        # Replace only the trailing extension, not every ".pdf" in the path.
        txtname = filename[:-len(".pdf")] + "_pdf.txt"
        if index_name is None:
            index_name = "nomeBaseIndice.pdf"
        data = extract_pdf_text(filename)
    elif filename.endswith(".djvu"):
        txtname = filename[:-len(".djvu")] + "_djvu.txt"
        if index_name is None:
            index_name = "nomeBaseIndice.djvu"
        data = extract_djvu_text(filename)
    else:
        print("Il file dev'essere pdf o djvu")
        return

    pages = [
        ("==[[Pagina:%s/%d]]==\n" % (index_name, number)) + page
        for number, page in enumerate(data, 1)
    ]
    testo = "\n".join(pages)
    with open(txtname, "w") as output:
        output.write(testo.encode("utf-8"))
    return "Salvato ", txtname
if __name__ == "__main__":
    # Script entry point: pass the raw command-line argument list to main().
    params = sys.argv
    main(params)
#