#!/usr/bin/python
# -*- coding: utf-8 -*-
#import pywikibot as bot
import re
from internetarchive import *
import pickle
from os import remove
from shutil import copy
# Module-level session object, created as a side effect of importing this file.
# get_session is not defined here, so it presumably comes from the
# `from internetarchive import *` star import above.
# NOTE(review): nothing in the visible part of this file reads `session` -
# confirm it is still needed before removing it.
session=get_session()
def grab_item(id):
    """Return a reduced copy of the metadata of the item *id*.

    Only the keys listed in ``wanted`` are copied, and only those that the
    source item's metadata actually contains; missing keys are simply absent
    from the returned dictionary.
    """
    wanted = ("title", "creator", "year", "subject", "description",
              "source", "language", "licenseurl", "mediatype")
    source_metadata = get_item(id).metadata
    return {key: source_metadata[key]
            for key in wanted if key in source_metadata}
def uploadPdf(pdf):
    """Upload the file *pdf* as a new item described by ``upload.txt``.

    ``upload.txt`` is read from the current working directory as UTF-8 and
    holds one ``key:value`` pair per line, for example::

        identifier:sinoalconfineitws
        title:Sino al confine
        creator:Grazia Deledda
        description:Novel by Grazia Deledda, Fratelli Treves Editori, Milano, 1910
        subject:itwikisource item,italian literature,novels
        date:1910
        language:ita
        licenseurl:http://creativecommons.org/publicdomain/mark/1.0/
        mediatype:texts
        collection:opensource

    The values of ``subject``, ``creator`` and ``language`` are turned into
    lists when they contain a comma. Blank lines are ignored. The
    ``identifier`` entry names the item and is removed from the metadata that
    is sent along with the file.

    Returns ``(item, metadati)`` after the upload, or ``None`` (after printing
    a message) when an item with that identifier already exists.

    Raises KeyError when ``upload.txt`` has no ``identifier`` line and
    IndexError when a non-blank line contains no ``:``.
    """
    import io  # local import: gives a decoding, self-closing file object

    with io.open("upload.txt", encoding="utf-8") as f:
        righe = f.read().split("\n")

    campi_multipli = ("subject", "creator", "language")
    metadati = {}
    for riga in righe:
        if not riga.strip():
            # A trailing newline or an empty line is not a metadata entry.
            continue
        campi = riga.split(":", 1)
        chiave, valore = campi[0], campi[1]
        if chiave in campi_multipli and "," in valore:
            valore = valore.split(",")
        metadati[chiave] = valore

    item = metadati.pop("identifier")
    if get_item(item).exists:
        print("Item  %s  exists" % item)
        return

    # The file is uploaded under the name <identifier>.pdf, so work on a copy.
    copia = item + ".pdf"
    copy(pdf, copia)
    print("L'item viene caricato")
    try:
        upload(item, copia, metadati, verbose=True)
    finally:
        # Remove the temporary copy even when the upload fails.
        remove(copia)
    return (item, metadati)
def metadata(item):
    """Replace the metadata of the existing item *item* with fixed values.

    The values are the ``key:value`` lines hard-coded below; ``subject``,
    ``creator`` and ``language`` become lists when they contain a comma.
    Prints a message and does nothing else when the item does not exist.
    Always returns ``None``.
    """
    testo = u'''title:La via del male
creator:Grazia Deledda
description:Romanzo di Grazia Deledda, Nuova Antologia, Roma, 1906
subject:itwikisource item,italian literature,novels
date:1906
language:ita
licenseurl:http://creativecommons.org/publicdomain/mark/1.0/
mediatype:texts
collection:opensource'''
    campi_multipli = ("subject", "creator", "language")
    metadati = {}
    for riga in testo.split("\n"):
        campi = riga.split(":", 1)
        chiave, valore = campi[0], campi[1]
        if chiave in campi_multipli and "," in valore:
            valore = valore.split(",")
        metadati[chiave] = valore

    if not get_item(item).exists:
        print("L'item non esiste")
        return
    print("Modifico i metadati")
    modify_metadata(item, metadati, verbose=True)
    return
def uploadDjvu(id, djvu):
    """Upload the file *djvu* as a new item named ``id + "_djvu"``.

    The new item reuses the metadata that grab_item() copies from the source
    item *id*, with a note linking back to the source item appended to the
    description. Returns ``None``.
    """
    metadata = grab_item(id)
    newId = id + "_djvu"
    # grab_item() only copies the keys the source item really has, so neither
    # "description" nor "title" is guaranteed to be present.
    title = metadata.get("title", id)
    nota = '<br>Derived from files into <a href="https://archive.org/details/%s">%s</a>' % (id, title)
    description = metadata.get("description", "")
    if isinstance(description, list):
        # "+=" on a list would add the note one character at a time.
        metadata["description"] = description + [nota]
    else:
        metadata["description"] = description + nota
    upload(newId, djvu, metadata, verbose=True)
    return
def fixMeta():
    """Interactively ask for a place for the records flagged with "xxxx".

    Loads the list pickled in ``listaOpal.pcl`` (current directory) and, for
    every record whose element 4 equals ``"xxxx"``, prompts with the record's
    element 2 and builds a regular expression from the answer.

    NOTE(review): this helper is unfinished - the compiled pattern is never
    applied and nothing is written back. The meaning of the record fields is
    not visible in this file.
    """
    t = carica_pcl("listaOpal", "")
    for record in t:
        # Test the current record: the previous code indexed the whole list
        # (t[4], t[2]) and ignored the loop variable.
        if record[4] == "xxxx":
            place = raw_input(record[2] + ": ")
            # The answer is free text, so escape it before using it in a regex.
            pl = re.compile(r"%s: *([^,]+), (\d\d\d\d)" % re.escape(place))
            # TODO: use `pl`; the original follow-up loop was an empty stub.
    return
# returns the list of the items found by the query
def search(query):
    """Return the ``identifier`` of every result of ``search_items(query)``."""
    return [risultato["identifier"] for risultato in search_items(query)]
# loads the metadata of one or more items as a list of metadata dictionaries
def grab_metadata(l):
    """Return the metadata of one or more items.

    *l* is either a single identifier (byte or unicode string) or a list of
    identifiers. The result is a new list holding ``get_item(x).metadata``
    for each identifier, in input order; the list passed in by the caller is
    not modified.
    """
    # (type(""), type(u"")) is (str, unicode) on Python 2.
    if isinstance(l, (type(""), type(u""))):
        l = [l]
    return [get_item(identificativo).metadata for identificativo in l]
# returns the metadata of the items obtained with a query
def search_metadata(query):
    """Run search() on *query* and return grab_metadata() of its result."""
    return grab_metadata(search(query))
# receives a metadata dictionary and turns it into editable utf-8 text
def dict2txt(metadata):
    """Render *metadata* as ``key:value`` lines joined by newlines, UTF-8 encoded.

    The keys in ``in_testa`` come first, in that order, when present. Every
    other key follows in dictionary iteration order, except the keys listed
    in ``esclusi``, which are left out. List values are joined with commas.
    """
    in_testa = ["identifier", "title", "description", "subject",
                "publisher", "city", "year"]
    esclusi = in_testa + ["addeddate", "scanner", "curation", "ocr",
                          "publicdate", "uploader", "ppi", "mediatype"]

    def riga(chiave):
        # One "key:value" line; lists are flattened to comma-separated text.
        valore = metadata[chiave]
        if type(valore) == list:
            valore = ",".join(valore)
        return chiave + ":" + valore

    righe = [riga(chiave) for chiave in in_testa if chiave in metadata]
    righe.extend(riga(chiave) for chiave in metadata if chiave not in esclusi)
    return "\n".join(righe).encode("utf-8")
# receives utf-8 or unicode text in dict2txt format and turns it into a metadata dictionary
def txt2dict(testo):
    """Parse ``key:value`` lines into a metadata dictionary.

    *testo* is unicode text or UTF-8 encoded bytes, one ``key:value`` pair
    per line; keys and values are stripped of surrounding whitespace. A value
    containing a comma becomes a list of stripped parts, except for the
    ``description`` key, whose value is always kept as a single string.

    Returns the dictionary, or ``False`` (after printing a message) when the
    text cannot be decoded or a line has no ``:``.
    """
    tipo_testo = type(u"")  # unicode on Python 2
    if not isinstance(testo, tipo_testo):
        try:
            testo = testo.decode("utf-8")
        except (AttributeError, UnicodeError):
            print("Il testo deve essere utf-8 o unicode")
            return False

    metadata = {}
    for riga in testo.strip().split("\n"):
        campi = riga.split(":", 1)
        if len(campi) != 2:
            print("Ogni riga deve avere un carattere : ")
            return False
        chiave = campi[0].strip()
        valore = campi[1].strip()
        # Compare the key of the current line: the previous test compared the
        # first line (a list) with "description" and was therefore always true.
        if chiave != "description" and "," in valore:
            valore = [parte.strip() for parte in valore.split(",")]
        metadata[chiave] = valore
    return metadata
# utilities
# New version, handles nested tags; x is the "non-specific" part of the
# opening tag (e.g. {{ when searching for {{Intestazione| )
def find_stringa(stringa, idi, idf, dc=0, x=None, side="left"):
    """Return the part of *stringa* enclosed between *idi* and *idf*.

    stringa -- text to search
    idi     -- opening delimiter
    idf     -- closing delimiter
    dc      -- 0: return only the text between the delimiters;
               anything else: return the delimiters as well
    x       -- generic opening marker used to balance nested tags: the match
               is extended to later occurrences of *idf* until *x* no longer
               occurs more often than *idf* inside it
    side    -- "right" starts from the last occurrence of *idi*, any other
               value from the first one

    Returns "" when *idi* is not found, when no *idf* follows it, or when the
    nesting cannot be balanced because *stringa* runs out of *idf*.
    """
    if side == "right":
        inizio = stringa.rfind(idi)
    else:
        inizio = stringa.find(idi)
    if inizio == -1:
        return ""

    chiusura = stringa.find(idf, inizio + len(idi))
    if chiusura == -1:
        # Checked on the raw find() result: adding len(idf) first used to hide
        # the -1 whenever the closing delimiter was longer than one character.
        return ""
    fine = chiusura + len(idf)

    if x is not None:
        while stringa[inizio:fine].count(x) > stringa[inizio:fine].count(idf):
            chiusura = stringa.find(idf, fine)
            if chiusura == -1:
                # Unbalanced nesting: there is no further closing delimiter.
                return ""
            fine = chiusura + len(idf)

    if dc == 0:
        return stringa[inizio + len(idi):fine - len(idf)]
    return stringa[inizio:fine]
def produci_lista(testo, idi, idf, dc=1, inizio=None):
    """Return every *idi* ... *idf* element of *testo*, in order of appearance.

    Elements are located one at a time with find_stringa() (with *inizio* as
    its nesting marker) and removed from a working copy of the text until
    none is left. With ``dc == 0`` the delimiters are stripped from each
    element; otherwise they are kept.
    """
    residuo = testo[:]
    trovati = []
    while True:
        elemento = find_stringa(residuo, idi, idf, 1, inizio)
        if elemento == "":
            break
        # Drop the element just found so the next pass sees the following one.
        residuo = residuo.replace(elemento, "", 1)
        if dc == 0:
            elemento = find_stringa(elemento, idi, idf, 0, inizio)
        trovati.append(elemento)
    return trovati
def carica_pcl(nome_file, folder="dati/"):
    """Load and return the object pickled in ``folder + nome_file + ".pcl"``.

    SECURITY: unpickling can execute arbitrary code - only load files that
    this program (or a trusted source) has written.
    """
    nome_file = folder + nome_file + ".pcl"
    # The open mode is left as it was so that existing files keep loading.
    # NOTE(review): binary mode ("rb") is required for binary pickle protocols
    # and on Python 3 - confirm how the .pcl files are written before changing.
    with open(nome_file) as f:
        # "with" closes the file even when pickle.load() raises.
        return pickle.load(f)
def salva_pcl(variabile, nome_file="dato", folder="dati/"):
    """Pickle *variabile* into ``folder + nome_file + ".pcl"``.

    Prints a confirmation with the full file name and returns ``None``.
    """
    nome_file = folder + nome_file + ".pcl"
    # The open mode is left as it was so that the files stay readable by code
    # that opens them in text mode.
    # NOTE(review): binary mode ("wb") is required for binary pickle protocols
    # and on Python 3 - confirm how the .pcl files are read before changing.
    with open(nome_file, "w") as f:
        # "with" closes the file even when pickle.dump() raises.
        pickle.dump(variabile, f)
    print("Variabile salvata nel file " + nome_file)
    return
'{}'