Progetto:Bot/Programmi in Python per i bot/jp2todjvu.py

#!/usr/bin/python
# -*- coding: utf-8  -*-

import argparse
import os
import pickle
import shutil
import subprocess
import urllib
import urlparse
import zipfile

from PIL import Image
from internetarchive import download

# How to use it

# Dependencies:
#* PIL (Pillow version: http://pillow.readthedocs.io/en/3.1.x/installation.html) built with OpenJPEG support (for JPEG2000)
#* internetarchive: https://pypi.python.org/pypi/internetarchive
#* djvuLibre: http://djvu.sourceforge.net/

#1. create a working folder on your PC
#2. copy this script into that folder with the name djvuCl.py
#3. go into folder and verify djvuLibre, PIL and internetarchive are reachable
#4. run the script: python djvuCl.py ID_ARCHIVE


# Technical details:

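# input: IA_identifier
# downloaded files: IA_identifier_jp2.zip
#                   IA_identifier_djvu.xml
#   (IA_identifier.pdf is used only by the disabled PDF-based code at the end of this file)
# external programs: c44, djvm, djvuxmlparser
#   (cjb2 and pdfimages are used only by the disabled PDF-based code at the end of this file)
# page names: IA_identifier_0000.djvu, IA_identifier_0001.djvu....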

# jp2 folder: jp2
# jpg folder: jpg
# pbm folder: pbm
# individual djvu pages folder: djvu
# input folder: input
# output folder: output


def path2url(path):
    """Return the ``file:`` URL corresponding to a local filesystem path.

    path: local path, converted with ``pathname2url`` and then joined
    onto the ``file:`` scheme.

    The two helpers live in different modules on Python 2 and Python 3,
    so they are resolved here; the result is the same on both.
    """
    try:
        # Python 3 location of the helpers.
        from urllib.parse import urljoin
        from urllib.request import pathname2url
    except ImportError:
        # Python 2 location of the helpers.
        from urlparse import urljoin
        from urllib import pathname2url
    return urljoin('file:', pathname2url(path))

def cleanfolder(dirpath):
    """Make sure ``dirpath`` exists and contains nothing.

    The folder is created when it is missing; otherwise every entry in it
    is deleted. Sub-folders are removed recursively; anything that cannot
    be removed as a tree (e.g. a plain file) is removed as a single file.
    """
    if not os.path.isdir(dirpath):
        os.mkdir(dirpath)

    entries = [os.path.join(dirpath, name) for name in os.listdir(dirpath)]
    for entry in entries:
        try:
            shutil.rmtree(entry)
        except OSError:
            # rmtree refuses non-directories: fall back to a file delete.
            os.remove(entry)

def dezip(zipf):
    """Extract the JPEG 2000 pages of an archive in ``input`` into ``jp2``.

    The ``jp2`` folder is emptied first. Only members whose name ends in
    ``.jp2`` are extracted; the folder structure inside the archive is
    dropped and each page is written directly under ``jp2`` using its
    base name.

    zipf: file name of the zip archive, relative to the ``input`` folder.
    """
    cleanfolder("jp2")
    with zipfile.ZipFile(os.path.join("input", zipf)) as archive:
        for member in archive.namelist():
            jp2 = member.split("/").pop()
            if not jp2.endswith(".jp2"):
                continue
            with open(os.path.join("jp2", jp2), "wb") as out:
                out.write(archive.read(member))
            # Report only the members that were actually written.
            print("%s  saved" % jp2)

def downloadItem(IAid):
    """Fetch the files needed for the conversion into a clean ``input`` folder.

    Two downloads are requested for the item ``IAid``, in this order: the
    files matching ``*_djvu.xml`` and the files matching ``*_jp2.zip``.
    """
    cleanfolder("input")
    for pattern in ("*_djvu.xml", "*_jp2.zip"):
        download(IAid, glob_pattern=pattern, destdir="input",
                 verbose=True, no_directory=True)

def jp2tojpg(fileformat="jpg"):
    if fileformat not in ("jpg", "pbm"):
        raise ValueError("Formato file intermedio non supportato")
    cleanfolder("jpg")
    cleanfolder("pbm")
    listaJp2 = os.listdir("jp2")
    listaJp2.sort()
    for f in range(len(listaJp2)):
        if listaJp2[f].endswith(".jp2"):
            fout = "%s.%s" % (listaJp2[f][0:-4], fileformat)
            image = Image.open(os.path.join("jp2", listaJp2[f]))
            if f == 0 and image.size[0] < 1000:
                    fattore=1024.0/image.size[0]
                    image=image.resize((int(image.size[0]*fattore),int(image.size[1]*fattore)))
            image.save(os.path.join(fileformat, fout))
            #comando="convert jp2/%s jpg/%s" % (listaJp2[f], fout)
            #res = os.system(comando)
            print fout, " salvata"

def jpgtodjvu(fileformat="jpg"):
    """Encode every intermediate image as a single-page DjVu file with ``c44``.

    fileformat: ``"jpg"`` or ``"pbm"``; names both the input folder and
    the extension of the files to convert. Output pages go to the ``djvu``
    folder, which is emptied first.

    ``c44`` is started with an argument list rather than through a shell
    string, so file names containing spaces or shell metacharacters are
    passed through unchanged.

    Raises ValueError for an unsupported ``fileformat`` and OSError when
    the ``c44`` program cannot be started.
    """
    if fileformat not in ("jpg", "pbm"):
        raise ValueError("Formato file intermedio non supportato")
    cleanfolder("djvu")
    suffix = "." + fileformat
    for name in os.listdir(fileformat):
        if not name.endswith(suffix):
            continue
        comando = ["c44",
                   os.path.join(fileformat, name),
                   os.path.join("djvu", name[:-len(suffix)] + ".djvu")]
        res = subprocess.call(comando)
        print("%s %s" % (res, " ".join(comando)))

def merge(pathdjvu="djvu"):
    """Bundle the single-page DjVu files into one document under ``output``.

    pathdjvu: folder holding the single-page ``.djvu`` files.

    The pages are taken in sorted order. A first batch of pages is bundled
    with ``djvm -c``; the batch stops growing once the joined paths exceed
    7500 characters (presumably to stay below command-line length limits).
    Every remaining page is then appended one at a time with ``djvm -i``.
    The bundle is named after the first page, with a trailing
    ``_0000.djvu`` replaced by ``.djvu``.

    Returns the space-separated paths of the first batch (each path is
    followed by one space).

    Raises ValueError when ``pathdjvu`` contains no ``.djvu`` file and
    OSError when ``djvm`` cannot be started.
    """
    cleanfolder("output")
    pages = sorted(name for name in os.listdir(pathdjvu)
                   if name.endswith(".djvu"))
    if not pages:
        raise ValueError("Nessun file .djvu trovato in %s" % pathdjvu)

    # Collect the first batch, honouring the command-length threshold.
    batch = []
    batch_length = 0
    for name in pages:
        path = os.path.join(pathdjvu, name)
        batch.append(path)
        batch_length += len(path) + 1
        if batch_length > 7500:
            break

    djvuBundled = os.path.join("output",
                               pages[0].replace("_0000.djvu", ".djvu"))
    comando = ["djvm", "-c", djvuBundled] + batch
    res = subprocess.call(comando)
    print("%s %s" % (res, " ".join(comando)))

    # Pages that did not fit in the first batch are inserted one by one.
    for name in pages[len(batch):]:
        comando = ["djvm", "-i", djvuBundled, os.path.join(pathdjvu, name)]
        res = subprocess.call(comando)
        print("%s %s" % (res, " ".join(comando)))

    return "".join(path + " " for path in batch)



def editXml(IAid):
    """Point the ``OBJECT data`` URL of the item's DjVu XML at the local bundle.

    Reads ``input/<IAid>_djvu.xml``, takes the value of the first
    ``OBJECT data="..."`` attribute and replaces every occurrence of that
    value with the ``file:`` URL of ``output/<IAid>.djvu`` under the
    current working directory. The file is rewritten in place.

    Raises ValueError when no non-empty ``OBJECT data`` value is found:
    replacing an empty string would insert the new URL between every
    character of the document.
    """
    xmlFile = os.path.join("input", IAid) + "_djvu.xml"
    with open(xmlFile) as source:
        xml = source.read()
    url = find_stringa(xml, 'OBJECT data="', '"', 0)
    if not url:
        raise ValueError("Attributo OBJECT data non trovato in " + xmlFile)
    urlNew = path2url(os.getcwd()) + "/output/" + IAid + ".djvu"
    xml = xml.replace(url, urlNew)
    with open(xmlFile, "w") as target:
        target.write(xml)
    print("File " + IAid + "_djvu.xml modificato")

def caricaTesto(IAid):
    """Load the text layer of the item into the bundled DjVu file.

    The item's XML is first retargeted with ``editXml`` and then split
    into header, list of objects and footer. The objects are processed in
    blocks of 50: each block is written, wrapped in header and footer, to
    the temporary file ``testo.xml`` and handed to ``djvuxmlparser``.
    Working on small blocks avoids out-of-memory errors in the parser.
    """
    editXml(IAid)
    h, b, f = splitObject(IAid)
    for i in range(0, len(b), 50):
        # Close the file before the external parser opens it, so the
        # content is guaranteed to be flushed to disk.
        with open("testo.xml", "w") as chunk:
            chunk.write(h + "\n".join(b[i:i + 50]) + f)
        print("scritto xml per pagine  %s  -  %s" % (i, i + 50))
        comando = "djvuxmlparser testo.xml"
        print(comando)
        res = os.system(comando)
        print("risultato:  %s" % res)

# utilities 

def find_stringa(stringa, idi, idf, dc=0, x=None, side="left"):
    """Return the part of ``stringa`` enclosed between two delimiters.

    stringa: text to search.
    idi: start delimiter.
    idf: end delimiter, searched after the start delimiter.
    dc: 0 returns the text between the delimiters; any other value
        returns the text including both delimiters.
    x: optional nesting marker. While the candidate span contains more
        occurrences of ``x`` than of ``idf``, the span is extended to the
        next ``idf`` (used to match nested structures).
    side: ``"right"`` starts from the last occurrence of ``idi``; any
        other value starts from the first one.

    Returns an empty string when the start delimiter, the end delimiter,
    or a balancing end delimiter cannot be found.
    """
    if side == "right":
        start = stringa.rfind(idi)
    else:
        start = stringa.find(idi)
    if start < 0:
        return ""

    # Test the raw find() result: adding len(idf) first would hide a
    # failed search whenever the end delimiter is longer than one char.
    close = stringa.find(idf, start + len(idi))
    if close < 0:
        return ""
    end = close + len(idf)

    if x is not None:
        while stringa[start:end].count(x) > stringa[start:end].count(idf):
            close = stringa.find(idf, end)
            if close < 0:
                # Unbalanced nesting: there is no span to return.
                return ""
            end = close + len(idf)

    if dc == 0:
        return stringa[start + len(idi):end - len(idf)]
    return stringa[start:end]

def produci_lista(testo, idi, idf, dc=1, inizio=None):
    """Collect, in order, every span of ``testo`` found by ``find_stringa``.

    testo: text to scan.
    idi, idf: start and end delimiters passed to ``find_stringa``.
    dc: 0 stores each span without its delimiters; any other value stores
        it with the delimiters included.
    inizio: nesting marker forwarded to ``find_stringa`` as ``x``.

    Each span that is found is removed from the working text before the
    next search, until ``find_stringa`` returns an empty string.
    """
    remaining = testo
    found = []
    while True:
        span = find_stringa(remaining, idi, idf, 1, inizio)
        if span == "":
            break
        remaining = remaining.replace(span, "", 1)
        if dc == 0:
            span = find_stringa(span, idi, idf, 0, inizio)
        found.append(span)
    return found

def carica_pcl(nome_file, folder="dati/"):
    """Load and return the object pickled in ``<folder><nome_file>.pcl``.

    nome_file: file name without the ``.pcl`` extension.
    folder: prefix prepended as-is to the file name, so it must end with a
        path separator.

    The file is opened in binary mode, as pickle data requires.
    """
    nome_file = folder + nome_file + ".pcl"
    # NOTE: unpickling can execute arbitrary code; only load files that
    # were produced by a trusted source.
    with open(nome_file, "rb") as f:
        return pickle.load(f)

def salva_pcl(variabile, nome_file="dato", folder="dati/"):
    """Pickle ``variabile`` into ``<folder><nome_file>.pcl``.

    variabile: object to store.
    nome_file: file name without the ``.pcl`` extension.
    folder: prefix prepended as-is to the file name, so it must end with a
        path separator.

    The file is opened in binary mode, as pickle data requires, and a
    confirmation line is printed once it has been written.
    """
    nome_file = folder + nome_file + ".pcl"
    with open(nome_file, "wb") as f:
        pickle.dump(variabile, f)
    print("Variabile salvata nel file " + nome_file)

def main(IAid, down=True, fileformat="jpg"):
    """Run the whole conversion for one Internet Archive item.

    IAid: identifier of the item.
    down: when true the item files are downloaded first; otherwise the
        files are expected to be in the ``input`` folder already.
    fileformat: intermediate image format, forwarded to ``jp2tojpg`` and
        ``jpgtodjvu``.

    Steps, in order: optional download, extraction of the JP2 pages,
    conversion to the intermediate format, encoding to DjVu pages,
    bundling, loading of the text layer.
    """
    if down:
        downloadItem(IAid)
    archive_name = IAid + "_jp2.zip"
    dezip(archive_name)
    jp2tojpg(fileformat=fileformat)
    jpgtodjvu(fileformat=fileformat)
    merge()
    caricaTesto(IAid)


def splitObject(IAid):
    """Split ``input/<IAid>_djvu.xml`` into header, objects and footer.

    The text is cut at every ``<OBJECT`` marker. Everything before the
    first marker, plus a newline, is the header. Each following piece is
    stripped of surrounding whitespace and given back its ``<OBJECT ``
    prefix. The closing ``</BODY>`` / ``</DjVuXML>`` lines are removed
    from the last piece and returned separately as the footer.

    Returns a tuple ``(header, list_of_objects, footer)``.
    """
    xmlFile = os.path.join("input", IAid) + "_djvu.xml"
    with open(xmlFile) as source:
        xml = source.read()
    footer = "\n</BODY>\n</DjVuXML>"
    parts = xml.split("<OBJECT")
    parts[1:] = ["<OBJECT " + piece.strip() for piece in parts[1:]]
    # The footer is removed from the last piece before the header is
    # detached, so a document without objects loses it from the header.
    parts[-1] = parts[-1].replace(footer, "")
    header = parts.pop(0) + "\n"
    return (header, parts, footer)

if __name__ == "__main__":
    # Command-line entry point: one positional identifier plus two
    # switches controlling the download and the intermediate format.
    parser = argparse.ArgumentParser(
        description="Crea un file DjVu a partire dall'Internet Archive.")
    parser.add_argument('id', help="identificatore dell'Internet Archive")
    parser.add_argument('--no-download', action='store_false',
                        dest='download', help='non scaricare il file')
    parser.add_argument('--pbm', action='store_true', dest='pbm',
                        help='usa PBM come formato intermedio (non compresso)')

    options = parser.parse_args()

    intermediate = "jpg"
    if options.pbm:
        intermediate = "pbm"
    main(options.id, down=options.download, fileformat=intermediate)


# passo 1: estrazione immagini
##def estrai(IA_identifier):
##	comando="erase temp\*.* /Q"
##	res=os.system(comando)
##	comando="pdfimages -f 1 -l 10 %s.pdf temp/%s" %(IA_identifier,IA_identifier)
##	res=os.system(comando)
##	print comando, res
##	comando="erase temp\*.ppm"
##	res=os.system(comando)
##	print comando,res
##	return
##	
##def inverti():
##        lista=os.listdir("temp")
##        for el in lista:
##                immagine=Image.open("temp/"+el)
##                immagine=immagine.convert("RGB")
##                immagine=ImageOps.invert(immagine)
##                immagine=immagine.convert("1")
##                immagine.save("temp/"+el)
##        return
##
##def cjb2():
##        n=0
##        for el in os.listdir("temp"):
##                pbm="temp/"+el
##                djvu="djvu/"+el[0:-8]+str(n).zfill(4)+".djvu"
##                comando="cjb2 %s %s" % (pbm,djvu)
##                res=os.system(comando)
##                print res,comando
##                if n==0:
##                        djvuBase=djvu
##                else:
##                        comando="djvm -i %s %s" % (djvuBase,djvu)
##                        res=os.system(comando)
##                        print "   ",res,comando
##                n+=1
##        return