Progetto:Bot/Programmi in Python per i bot/textExtract.py

# -*- coding: utf-8 -*-
# text alignment program
# some ideas taken from Match and Split

import os, re, sys


# undo the escaping of special characters in the djvu text
def unquote_text_from_djvu(text):
    """Decode the backslash escapes of a djvu text-layer string.

    The escapes handled are: ``\\n`` and ``\\037`` (become a newline),
    ``\\t`` (becomes a single space), ``\\"`` (becomes ``"``), ``\\\\``
    (becomes one backslash), and ``\\035`` / ``\\013`` (dropped).  Any other
    backslash sequence is left untouched.  Trailing newlines are stripped
    from the result.

    All escapes are decoded in a single left-to-right pass, so an escaped
    backslash followed by ``n``, ``t``, ``"`` or digits is not mistaken for
    another escape (chained ``str.replace`` calls would turn the four
    characters ``\\\\n`` into a backslash plus a newline).

    :param text: the escaped text (unicode).
    :return: the decoded text, without trailing newlines.
    """
    replacements = {
        u'n': u'\n',
        u't': u' ',
        u'"': u'"',
        u'\\': u'\\',
        u'037': u'\n',
        u'035': u'',
        u'013': u'',
    }
    # One backslash followed by exactly one of the known escape bodies.
    decoded = re.sub(r'\\(037|035|013|[nt"\\])',
                     lambda match: replacements[match.group(1)],
                     text)
    return decoded.rstrip(u'\n')
# extracts the text layer in "detail page" mode and builds the list data of
# the page texts (page 1 in data[0]);
# the djvu file must already be downloaded into the local folder
def extract_djvu_text(filename):
    """Return the text layer of a local djvu file as a list of page texts.

    Runs ``djvutxt --detail=page <filename> text.txt`` in the current
    directory, then parses ``text.txt``: every ``(page x y w h "...")``
    entry (or empty ``()`` entry) yields one unicode item, decoded with
    unquote_text_from_djvu().  Page 1 is at index 0.

    The tool is invoked with an argument list rather than a shell command
    line, so file names containing spaces or shell metacharacters work.

    :param filename: path of the djvu file (unicode or UTF-8 byte string).
    :return: list of unicode strings, one per page entry found.
    :raises OSError: if the ``djvutxt`` executable cannot be started.
    :raises IOError: if ``text.txt`` cannot be read afterwards.
    """
    import subprocess  # function-scope import: only needed by this helper

    print("extracting text layer")

    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')

    os.environ['LANG'] = 'en_US.UTF8'

    # The name is handed to djvutxt re-encoded as latin-1.
    # NOTE(review): presumably to match the local filesystem encoding;
    # names with characters outside latin-1 will raise here - confirm.
    args = ["djvutxt", "--detail=page",
            unicode(filename, "utf-8").encode("latin-1"), "text.txt"]
    result = subprocess.call(args)
    print("%s %s" % (" ".join(args), result))

    with open("text.txt") as dump:
        text = dump.read()

    # Group 2 captures the quoted page text directly, so the "page x y w h"
    # header never has to be stripped afterwards (this also covers negative
    # coordinates, which the pattern explicitly allows).
    page_re = re.compile(
        r'\((page -?\d+ -?\d+ -?\d+ -?\d+[ \n]+"(.*)"[ ]*|)\)\n')

    data = []
    for match in page_re.finditer(text):
        # group(2) is None when the entry is the empty "()" alternative.
        quoted = match.group(2) or ''
        page_text = unicode(quoted, 'utf-8', 'replace')
        data.append(unquote_text_from_djvu(page_text))

    return data

def extract_pdf_text(filename):
    """Return the text of a local pdf file as a list of unicode chunks.

    Runs ``pdftotext -enc UTF-8 <filename> text.txt`` in the current
    directory, decodes ``text.txt`` as UTF-8, removes every ``u"¬ "``
    sequence and splits the result on form-feed characters (``\\x0c``).

    The tool is invoked with an argument list rather than a shell command
    line, so file names containing spaces or shell metacharacters work.

    :param filename: path of the pdf file (unicode or UTF-8 byte string).
    :return: list of unicode strings obtained by splitting on form feeds.
    :raises OSError: if the ``pdftotext`` executable cannot be started.
    :raises IOError: if ``text.txt`` cannot be read afterwards.
    """
    import subprocess  # function-scope import: only needed by this helper

    print("extracting text layer from pdf")

    if isinstance(filename, unicode):
        filename = filename.encode('utf-8')

    os.environ['LANG'] = 'en_US.UTF8'

    args = ["pdftotext", "-enc", "UTF-8", filename, "text.txt"]
    result = subprocess.call(args)
    print("%s %s" % (" ".join(args), result))

    with open("text.txt") as dump:
        text = unicode(dump.read(), "utf-8")

    # NOTE(review): "¬ " is presumably a line-break marker left in the
    # extracted text - confirm against real pdftotext output.
    text = text.replace(u"¬ ","")

    return text.split(u"\x0c")

def main(params):
    """Extract the text layer of a pdf/djvu file and save it as wiki text.

    ``params`` has the shape of ``sys.argv``: ``params[1]`` is the input
    file name and the optional ``params[2]`` is the index base name used in
    the page headers.  When ``params[2]`` is missing or ``None`` the
    placeholder ``nomeBaseIndice.pdf`` / ``nomeBaseIndice.djvu`` is used.

    Each page text is prefixed with a ``==[[Pagina:<index>/<n>]]==`` header
    (n starting at 1); the pages are joined with newlines and written,
    UTF-8 encoded, to ``<input>_pdf.txt`` or ``<input>_djvu.txt``.

    :param params: argument list as described above; it is not modified.
    :return: the tuple ``("Salvato ", txtname)`` on success, ``None`` when
        the file name is missing or has an unsupported extension.
    """
    if len(params) < 2:
        # Without this guard a missing file name raised IndexError.
        print("Uso: textExtract.py <file.pdf|file.djvu> [nomeBaseIndice]")
        return

    filename = params[1]
    # sys.argv never contains None: a missing optional argument shows up as
    # a shorter list, so treat both cases as "not given".
    index_name = params[2] if len(params) > 2 else None

    if filename.endswith(".pdf"):
        # Replace only the trailing extension, not every ".pdf" in the path.
        txtname = filename[:-len(".pdf")] + "_pdf.txt"
        if index_name is None:
            index_name = "nomeBaseIndice.pdf"
        data = extract_pdf_text(filename)
    elif filename.endswith(".djvu"):
        txtname = filename[:-len(".djvu")] + "_djvu.txt"
        if index_name is None:
            index_name = "nomeBaseIndice.djvu"
        data = extract_djvu_text(filename)
    else:
        print("Il file dev'essere pdf o djvu")
        return

    pages = [("==[[Pagina:%s/%d]]==\n" % (index_name, number)) + page
             for number, page in enumerate(data, 1)]

    testo = "\n".join(pages)
    with open(txtname, "w") as out:
        out.write(testo.encode("utf-8"))
    return "Salvato ", txtname


# Script entry point: when the file is executed directly, pass the raw
# command-line argument list (sys.argv) to main().
if __name__ == "__main__":
    params=sys.argv
    main(params)

#