Commit ee5f4a3e authored by Atrax Nicolas's avatar Atrax Nicolas

Update pages

parent ff89fe15
......@@ -4,7 +4,7 @@ name = "Home"
icon = ":house:"
[[pages]]
name = "API"
name = "API Tools"
icon = ":globe_with_meridians:"
is_section = true
......@@ -21,7 +21,15 @@ path = "pages/Zotero_To_GarganText.py"
name = "Zotero To GarganText"
[[pages]]
name = "Convert"
path = "pages/TSV_Translator.py"
name = "TSV Translator"
[[pages]]
path = "pages/YTB_to_TSV.py"
name = "YTB To TSV"
[[pages]]
name = "Convert Tools"
icon = ":twisted_rightwards_arrows:"
is_section = true
......@@ -70,12 +78,12 @@ name = "PDF To TXT"
path = "pages/TXT_to_TSV.py"
name = "TXT To TSV"
[[pages]]
name = "Other Tools"
icon = ":twisted_rightwards_arrows:"
is_section = true
[[pages]]
path = "pages/Merge_Term_GarganText.py"
name = "Merge Term GarganText"
in_section = false
name = "Merge GarganText Terms"
[[pages]]
path = "pages/TSV_Translator.py"
name = "TSV Translator"
locale,key,value
fr,title,"**HAL vers GarganText**"
en,title,"**HAL To GarganText**"
fr,title,"# HAL vers GarganText"
en,title,"# HAL To GarganText"
fr,text,"HAL est une base de documents scientifiques en ligne et libre d'accès contenant plus d'un million de documents."
en,text,"HAL is an online, open-access database of scientific documents containing more than a million documents."
......
locale,key,value
fr,title,"**Isidore vers GarganText**"
en,title,"**Isidore To GarganText**"
fr,title,"# Isidore vers GarganText"
en,title,"# Isidore To GarganText"
fr,keyword,"Mots clés"
en,keyword,"Keywords"
......
locale,key,value
fr,title,"Fusionne Deux Liste de Terme de GarganText"
en,title,"Input Two Term File From GarganText"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,title,"# Merge GarganText Terms"
en,title,"# Merge GarganText Terms"
fr,new_file,"Télécharge ton fichier fusionner:"
en,new_file,"Download your merge file:"
\ No newline at end of file
fr,text,"Fusionne 2 fichiers de termes de GarganText."
en,text,"Merge 2 term files from GarganText."
fr,file," Choisir un fichier "
en,file," Choose a file "
fr,new_file," Télécharge ton fichier fusionné "
en,new_file," Download your merged file "
\ No newline at end of file
......@@ -33,7 +33,7 @@ fr,globalWarning, "Attention ! Plusieurs langues ont été détectées entre vos
en,globalWarning,"Warning ! Multiple languages have been detected for your pdfs file !\nThe following languages have been detected : "
fr,advice,"Cela pourrait affecter massivement l'analyse de GarganText.\nVous pouvez régler ça en traduisant avec l'outil TsvTranslator."
en,advice,"This could massively affect the analysis of Gargantext.\nYou can correct this by translation with the TsvTranslator tool."
fr,advice,"Cela pourrait affecter massivement l'analyse de GarganText. Vous pouvez régler ça en traduisant avec l'outil TsvTranslator."
en,advice,"This could massively affect the analysis of GarganText. You can correct this by translating with the TsvTranslator tool."
......@@ -25,3 +25,15 @@ en,translate2," To "
fr,detected,"Langues détectées : "
en,detected,"Detected languages : "
fr,loading,"Progression de la traduction : "
en,loading,"Translation progress : "
fr,loadingLanguages," Analyse des langues du fichier "
en,loadingLanguages," File languages analysis "
fr,sameLanguages,"Une seule langue a été détectée au sein du fichier : "
en,sameLanguages,"Only one language has been detected inside this file : "
fr,anotherFile," Traduire un autre fichier "
en,anotherFile," Translate another file "
locale,key,value
fr,title,"# Bienvenue sur GanganText Tools"
en,title,"# Welcome to GanganText Tools"
fr,title,"# Bienvenue sur GarganTools"
en,title,"# Welcome to GarganTools"
fr,welcome,"Bienvenue sur ces pages rassemblant des outils développés par des utilisateurs de GarganText pour des utilisateurs de GarganText."
en,welcome,"Welcome to these pages featuring tools developed by GarganText users for GarganText users."
......
locale,key,value
fr,title,"# YTB To TSV"
en,title,"# YTB To TSV"
fr,title,"# Youtube To TSV"
en,title,"# Youtube To TSV"
fr,text,"Inspecte un fichier CSV pour vérifier s'il est compatible avec Gargantext"
en,text,"Inspect a CSV file to check if it is compatible with GarganText"
fr,text,"Effectue une recherche Youtube à l'aide de mots clés (thème, titre de vidéo, lien de vidéo,...) pour créer un fichier TSV à partir des sous-titres de vidéos."
en,text,"Run a YouTube search with keywords (topic, video title, video link,...) to create a TSV file based on the subtitles of the videos."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
......@@ -20,5 +20,14 @@ en,fill,"Only manual subtitles (longer waiting time)"
fr,submit," Soumettre "
en,submit," Submit "
fr,loadingID," Recherche de vidéos "
en,loadingID," Searching videos "
fr,loading,"Traitement des vidéos : "
en,loading,"Processing videos : "
fr,quantity," sur "
en,quantity," out of "
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file :"
locale,key,value
fr,title,"**Zotero vers GarganText**"
en,title,"**Zotero vers GarganText**"
fr,title,"# Zotero vers GarganText"
en,title,"# Zotero To GarganText"
fr,data,"Type de donnée"
en,data,"Type of data"
......
......@@ -245,11 +245,10 @@ def getContent(file, separator, data, success, fill, errorMessage):
# Code End
st.write(st.session_state.general_text_dict['title'])
st.write(st.session_state.general_text_dict['text'])
st.session_state.fill = st.checkbox(st.session_state.general_text_dict['fill'])
file = st.file_uploader(
st.session_state.general_text_dict['file'], type=["csv"], key='file')
st.session_state.general_text_dict['file'], type=["tsv", "csv"], key='file')
if file:
name = file.name.split('.')[0] + '.tsv'
......
......@@ -13,8 +13,11 @@ tmp.base("HALToGarganText")
limit = 500
limitItems = 10000
def loadApiHALNbFile(search, lang):
url = 'http://api.archives-ouvertes.fr/search/?q=' + search + '&rows=5&fl=title_s,' + lang + '_title_s,source_s,publicationDate_s,authFullName_s,' + lang + '_abstract_s,abstract_s&fq=language_s:' + lang
url = 'http://api.archives-ouvertes.fr/search/?q=' + search + '&rows=5&fl=title_s,' + lang + \
'_title_s,source_s,publicationDate_s,authFullName_s,' + \
lang + '_abstract_s,abstract_s&fq=language_s:' + lang
resp = req.get(url)
print(url)
try:
......@@ -26,7 +29,8 @@ def loadApiHALNbFile(search, lang):
def loadApiHAL(search, lang, page, nbvalue):
url = 'http://api.archives-ouvertes.fr/search/?q=' + search + '&start=' + str(page * limit) + '&rows=' + str(nbvalue) + '&fl=title_s,' + lang + '_title_s,source_s,publicationDate_s,authFullName_s,' + lang + '_abstract_s,abstract_s&fq=language_s:' + lang
url = 'http://api.archives-ouvertes.fr/search/?q=' + search + '&start=' + str(page * limit) + '&rows=' + str(
nbvalue) + '&fl=title_s,' + lang + '_title_s,source_s,publicationDate_s,authFullName_s,' + lang + '_abstract_s,abstract_s&fq=language_s:' + lang
resp = req.get(url)
print(url)
try:
......@@ -81,41 +85,50 @@ def getParamFromDoc(docs):
if 'publicationDate_s' in doc.keys():
split = doc['publicationDate_s'].split('-')
if len(split) == 3:
pdate = datetime.strptime(doc['publicationDate_s'], '%Y-%m-%d').strftime('%Y\t%m\t%d')
pdate = datetime.strptime(
doc['publicationDate_s'], '%Y-%m-%d').strftime('%Y\t%m\t%d')
elif len(split) == 2:
pdate = datetime.strptime(doc['publicationDate_s'], '%Y-%m').strftime('%Y\t%m\t1')
pdate = datetime.strptime(
doc['publicationDate_s'], '%Y-%m').strftime('%Y\t%m\t1')
else:
pdate = doc['publicationDate_s'] + '\t1\t1'
else:
pdate = '1900\t1\t1'
abstract = abstract.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '').replace('\n', '')
title = title.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '').replace('\n', '')
source = source.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '').replace('\n', '').replace('\n', '')
abstract = abstract.encode(encoding='UTF-8', errors='ignore').decode(
"utf-8").replace('\t', '').replace('"', '').replace('\n', '')
title = title.encode(encoding='UTF-8', errors='ignore').decode(
"utf-8").replace('\t', '').replace('"', '').replace('\n', '')
source = source.encode(encoding='UTF-8', errors='ignore').decode(
"utf-8").replace('\t', '').replace('"', '').replace('\n', '').replace('\n', '')
# Output
output += str(title) + "\t" + source + "\t" + str(pdate) + "\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"
output += str(title) + "\t" + source + "\t" + str(pdate) + \
"\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"
return output
def create_output(search, lang, nb_value):
    """Build the TSV export for a HAL search.

    Documents are requested page by page through ``loadApiHAL``: as many
    full pages of ``limit`` rows as fit into ``nb_value``, then one final
    shorter page for the remainder (if any). Each page's documents are
    turned into TSV rows by ``getParamFromDoc``.

    Args:
        search: Query string forwarded to ``loadApiHAL``.
        lang: Language code forwarded to ``loadApiHAL``.
        nb_value: Total number of documents to fetch.

    Returns:
        The TSV header line followed by the rows of every fetched page.
    """
    header = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"

    full_pages, remainder = divmod(nb_value, limit)

    # (page index, row count) for every request, in fetch order.
    page_plan = [(page, limit) for page in range(full_pages)]
    if remainder != 0:
        # The last, partial page is requested exactly once.
        page_plan.append((full_pages, remainder))

    chunks = [header]
    for page, rows in page_plan:
        response = loadApiHAL(search, lang, page, rows)
        chunks.append(getParamFromDoc(response['response']['docs']))
    return "".join(chunks)
lang = {
'Français' : 'fr',
'Anglais' : 'en',
'Français': 'fr',
'Anglais': 'en',
}
if 'stage_isidore' not in st.session_state:
st.session_state.stage_isidore = 0
st.session_state.nb_wanted = 1
def set_stage(stage):
st.session_state.stage_isidore = stage
st.session_state.output = ''
......@@ -127,19 +140,20 @@ st.write(st.session_state.general_text_dict['text'])
# Form
form = st.form('api')
form.write(st.session_state.general_text_dict['title'])
search = form.text_input(st.session_state.general_text_dict['keyword'])
language = form.selectbox(st.session_state.general_text_dict['lang'], lang.keys())
language = form.selectbox(
st.session_state.general_text_dict['lang'], lang.keys())
form.form_submit_button(st.session_state.general_text_dict['submit'], on_click=set_stage, args=(1,))
form.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=set_stage, args=(1,))
# API and Slider
if st.session_state.stage_isidore > 0:
# Only call first time and after
if 'search' not in st.session_state or 'language' not in st.session_state or search != st.session_state.search or language != st.session_state.language:
with st.spinner(st.session_state.general_text_dict['load_api'] ):
with st.spinner(st.session_state.general_text_dict['load_api']):
nb_doc = int(loadApiHALNbFile(search, lang[language]))
st.session_state.nb_doc = nb_doc
if nb_doc != 0:
......@@ -150,27 +164,31 @@ if st.session_state.stage_isidore > 0:
# Form with slider
form2 = st.form('my_form2')
form2.write(st.session_state.general_text_dict['nb_doc'] + str(st.session_state.nb_doc))
form2.write(
st.session_state.general_text_dict['nb_doc'] + str(st.session_state.nb_doc))
if st.session_state.nb_doc > limitItems:
form2.write(st.session_state.general_text_dict['perform1'] + str(limitItems) + st.session_state.general_text_dict['perform2'])
st.session_state.nb_wanted = form2.slider(st.session_state.general_text_dict['nb_taken'], 1, limitItems)
form2.write(st.session_state.general_text_dict['perform1'] + str(
limitItems) + st.session_state.general_text_dict['perform2'])
st.session_state.nb_wanted = form2.slider(
st.session_state.general_text_dict['nb_taken'], 1, limitItems)
else:
st.session_state.nb_wanted = form2.slider(st.session_state.general_text_dict['nb_taken'], 1, int(st.session_state.nb_doc))
form2.form_submit_button(st.session_state.general_text_dict['submit'], on_click=set_stage, args=(2,))
st.session_state.nb_wanted = form2.slider(
st.session_state.general_text_dict['nb_taken'], 1, int(st.session_state.nb_doc))
form2.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=set_stage, args=(2,))
else:
st.write(st.session_state.general_text_dict['overload_api'] )
st.write(st.session_state.general_text_dict['overload_api'])
# Download
if st.session_state.stage_isidore > 1:
with st.spinner(st.session_state.general_text_dict['createTSV'] ):
with st.spinner(st.session_state.general_text_dict['createTSV']):
if st.session_state.output == '':
print(st.session_state.nb_wanted)
st.session_state.output = create_output(st.session_state.search, lang[st.session_state.language], st.session_state.nb_wanted)
st.session_state.output = create_output(
st.session_state.search, lang[st.session_state.language], st.session_state.nb_wanted)
st.download_button('Download TSV', st.session_state.output, 'output.csv')
......@@ -17,9 +17,9 @@ limitItems = 5000 # Can't be superior of 10 times numberReplies
retryTime = 2
def loadApiIsidoreNumberFile(search, language):
url = 'https://api.isidore.science/resource/search?q=' + search + '&output=json&replies=10&language=http://lexvo.org/id/iso639-3/' + language
url = 'https://api.isidore.science/resource/search?q=' + search + \
'&output=json&replies=10&language=http://lexvo.org/id/iso639-3/' + language
resp = req.get(url)
print(url)
if resp.ok:
......@@ -30,8 +30,11 @@ def loadApiIsidoreNumberFile(search, language):
return docs
def loadApiIsidorePage(search, language, page):
url = 'https://api.isidore.science/resource/search?q=' + search + '&output=json&replies=' + str(numberReplies) + '&page=' + str(page) + '&language=http://lexvo.org/id/iso639-3/' + language
url = 'https://api.isidore.science/resource/search?q=' + search + '&output=json&replies=' + \
str(numberReplies) + '&page=' + str(page) + \
'&language=http://lexvo.org/id/iso639-3/' + language
resp = req.get(url)
print(url)
try:
......@@ -47,22 +50,23 @@ def create_output(search, language, nb_doc):
output = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
nb = 0
for i in range(1, nb_doc//numberReplies + 1):
while(True):
while (True):
txt = loadApiIsidorePage(search, language, i)
if txt != 0:
break
time.sleep(retryTime)
print('Retry')
tmp, nb_tmp = createFile(txt, nb_doc%numberReplies, language)
tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language)
output += tmp
nb += nb_tmp
if nb_doc%numberReplies != 0:
if nb_doc % numberReplies != 0:
txt = loadApiIsidorePage(search, language, nb_doc//numberReplies + 1)
tmp, nb_tmp = createFile(txt, nb_doc%numberReplies, language)
tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language)
output += tmp
nb += nb_tmp
return output, nb
def createFile(docs, limit, language):
# Output text
......@@ -72,11 +76,11 @@ def createFile(docs, limit, language):
for doc in docs:
if (i == limit):
break
i+=1
i += 1
# Title
title = doc["isidore"]["title"]
if (type(title) != str):
if(type(title) == list):
if (type(title) == list):
tmp = ''
for lang in title:
if type(lang) != str and lang['@xml:lang'] == language[:2]:
......@@ -93,22 +97,22 @@ def createFile(docs, limit, language):
title = title['$']
# Source
source =doc["isidore"]["source_info"]["sourceName"]["$"]
source = doc["isidore"]["source_info"]["sourceName"]["$"]
# Author
if doc['isidore']['enrichedCreators'] != []:
list_author = doc["isidore"]["enrichedCreators"]["creator"]
authors = []
if(type(list_author) == list):
if (type(list_author) == list):
for author in list_author:
authors.append(author["@origin"].replace('"',''))
authors.append(author["@origin"].replace('"', ''))
authors = ';'.join(authors)
else:
authors = list_author["@origin"].replace('"','')
authors = list_author["@origin"].replace('"', '')
else:
authors = ''
#Abstract
# Abstract
if 'abstract' in doc['isidore'].keys() and doc["isidore"]["abstract"] != []:
abstract = doc["isidore"]["abstract"]
else:
......@@ -126,7 +130,7 @@ def createFile(docs, limit, language):
abstract = abstract[0]['$']
else:
abstract = tmp
else :
else:
abstract = abstract['$']
if 'types' in doc['isidore'].keys():
......@@ -134,24 +138,27 @@ def createFile(docs, limit, language):
nb += 1
elif type(doc['isidore']['types']['type'] == dict) and doc['isidore']['types']['type'][1] in ['Books', 'text']:
nb += 1
else :
else:
print(title)
# Publication Date
pdate = getGoodTime(doc["isidore"]["date"]["@origin"])
abstract = abstract.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '')
title = title.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '')
source = source.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '').replace('\n', '')
abstract = abstract.encode(
encoding='UTF-8', errors='ignore').decode("utf-8").replace('\t', '').replace('"', '')
title = title.encode(
encoding='UTF-8', errors='ignore').decode("utf-8").replace('\t', '').replace('"', '')
source = source.encode(encoding='UTF-8', errors='ignore').decode(
"utf-8").replace('\t', '').replace('"', '').replace('\n', '')
# Output
row = str(title) + "\t" + source + "\t" + str(pdate) + "\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"
row = str(title) + "\t" + source + "\t" + str(pdate) + \
"\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"
output += row
return output, nb
# Need modification when Isidore upgrade his api
def getGoodTime(time):
time = time.replace('?', '0').replace('.', '0')
......@@ -160,43 +167,44 @@ def getGoodTime(time):
if (len(tiret) == 1 and len(tiret[0]) == 4):
if time != '0001':
return tiret[0] + '\t1\t1'
elif (len(tiret)>1):
if (len(tiret) == 2 and len(tiret[0])==4 and len(tiret[1]) <= 2):
elif (len(tiret) > 1):
if (len(tiret) == 2 and len(tiret[0]) == 4 and len(tiret[1]) <= 2):
return tiret[0] + '\t' + tiret[1] + '\t1'
elif (len(tiret) == 3 and len(tiret[0])==4 and len(tiret[1]) <= 2 and len(tiret[2].split('T')[0]) <= 2):
elif (len(tiret) == 3 and len(tiret[0]) == 4 and len(tiret[1]) <= 2 and len(tiret[2].split('T')[0]) <= 2):
return tiret[0] + '\t' + tiret[1] + '\t' + tiret[2].split('T')[0]
elif(len(slash)==1 and len(time) == 4):
elif (len(slash) == 1 and len(time) == 4):
return slash[0] + '\t1\t1'
elif (len(slash) > 1):
if (len(slash) == 2 and len(slash[1])==4 and len(slash[0]) <= 2):
if (len(slash) == 2 and len(slash[1]) == 4 and len(slash[0]) <= 2):
return slash[1] + '\t' + slash[0] + '\t1'
elif (len(slash) == 3 and len(slash[2])==4 and len(slash[1]) <= 2 and len(slash[0]) <= 2):
elif (len(slash) == 3 and len(slash[2]) == 4 and len(slash[1]) <= 2 and len(slash[0]) <= 2):
return slash[2] + '\t' + slash[1] + '\t' + slash[0].split('T')[0]
elif len(time) == 8:
return time[:4] + '\t' + time[4:6] + '\t' + time[6:8]
return '1900\t1\t1'
###Streamlit
# Streamlit
lang = {
'Français' : 'fra',
'Anglais' : 'eng',
'Espagnol' : 'spa',
'Italien' : 'ita',
'Allemand' : 'deu',
'Français': 'fra',
'Anglais': 'eng',
'Espagnol': 'spa',
'Italien': 'ita',
'Allemand': 'deu',
}
# Can be added but low result
#'Polonais' : 'nld',
#'Portugais' : 'por',
#'Russe' : 'rus'
# 'Polonais' : 'nld',
# 'Portugais' : 'por',
# 'Russe' : 'rus'
if 'stage_isidore' not in st.session_state:
st.session_state.stage_isidore = 0
st.session_state.nb_wanted = 1
def set_stage(stage):
st.session_state.stage_isidore = stage
st.session_state.output = ''
......@@ -212,19 +220,20 @@ with col2:
# Form
form = st.form('api')
form.write(st.session_state.general_text_dict['title'])
search = form.text_input(st.session_state.general_text_dict['keyword'])
language = form.selectbox(st.session_state.general_text_dict['lang'], lang.keys())
language = form.selectbox(
st.session_state.general_text_dict['lang'], lang.keys())
form.form_submit_button(st.session_state.general_text_dict['submit'], on_click=set_stage, args=(1,))
form.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=set_stage, args=(1,))
# API and Slider
if st.session_state.stage_isidore > 0:
# Only call first time and after
if 'search' not in st.session_state or 'language' not in st.session_state or search != st.session_state.search or language != st.session_state.language:
with st.spinner(st.session_state.general_text_dict['load_api'] ):
with st.spinner(st.session_state.general_text_dict['load_api']):
nb_doc = int(loadApiIsidoreNumberFile(search, lang[language]))
st.session_state.nb_doc = nb_doc
if nb_doc != 0:
......@@ -235,28 +244,32 @@ if st.session_state.stage_isidore > 0:
# Form with slider
form2 = st.form('my_form2')
form2.write(st.session_state.general_text_dict['nb_doc'] + str(st.session_state.nb_doc))
form2.write(
st.session_state.general_text_dict['nb_doc'] + str(st.session_state.nb_doc))
if st.session_state.nb_doc > limitItems:
form2.write(st.session_state.general_text_dict['perform1'] + str(limitItems) + st.session_state.general_text_dict['perform2'])
st.session_state.nb_wanted = form2.slider(st.session_state.general_text_dict['nb_taken'], 1, limitItems)
form2.write(st.session_state.general_text_dict['perform1'] + str(
limitItems) + st.session_state.general_text_dict['perform2'])
st.session_state.nb_wanted = form2.slider(
st.session_state.general_text_dict['nb_taken'], 1, limitItems)
else:
st.session_state.nb_wanted = form2.slider(st.session_state.general_text_dict['nb_taken'], 1, int(st.session_state.nb_doc))
form2.form_submit_button(st.session_state.general_text_dict['submit'], on_click=set_stage, args=(2,))
st.session_state.nb_wanted = form2.slider(
st.session_state.general_text_dict['nb_taken'], 1, int(st.session_state.nb_doc))
form2.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=set_stage, args=(2,))
else:
st.write(st.session_state.general_text_dict['overload_api'] )
st.write(st.session_state.general_text_dict['overload_api'])
# Download
if st.session_state.stage_isidore > 1:
with st.spinner(st.session_state.general_text_dict['createTSV'] ):
with st.spinner(st.session_state.general_text_dict['createTSV']):
if st.session_state.output == '':
print(st.session_state.nb_wanted)
st.session_state.output, st.session_state.nb_bad_file = create_output(st.session_state.search, lang[st.session_state.language], st.session_state.nb_wanted)
st.session_state.output, st.session_state.nb_bad_file = create_output(
st.session_state.search, lang[st.session_state.language], st.session_state.nb_wanted)
st.write(st.session_state.general_text_dict['doc_abstract1'] + str(st.session_state.nb_bad_file) + st.session_state.general_text_dict['doc_abstract2'])
st.write(st.session_state.general_text_dict['doc_abstract1'] + str(
st.session_state.nb_bad_file) + st.session_state.general_text_dict['doc_abstract2'])
st.download_button('Download TSV', st.session_state.output, 'output.csv')
......@@ -21,73 +21,71 @@ def tmp(file1, file2):
roots = []
leafs = []
# find the roots of list 1
for root in ngrams1.keys() :
for root in ngrams1.keys():
if ngrams1[root]['list'] == "MapTerm" :
if ngrams1[root]['list'] == "MapTerm":
roots.append(root)
leafs = leafs + ngrams1[root]['children']
# merge list 2 in list 1
for root in ngrams2.keys() :
for root in ngrams2.keys():
if root in roots :
if root in roots:
ngrams1[root]['children'] = list(set(ngrams1[root]['children'] + ngrams2[root]['children']))
ngrams1[root]['children'] = list(
set(ngrams1[root]['children'] + ngrams2[root]['children']))
else :
else:
if root not in leafs :
if root not in leafs:
ngrams1[root] = ngrams2[root]
children = ngrams1[root]['children']
ngrams1[root]['children'] = []
for child in children :
if child not in root or child not in leafs :
for child in children:
if child not in root or child not in leafs:
ngrams1[root]['children'].append(child)
# clean the merged list
for root in ngrams1.keys() :
for root in ngrams1.keys():
if ngrams1[root]['list'] == "MapTerm" :
if ngrams1[root]['list'] == "MapTerm":
if len(ngrams1[root]['children']) == 0 :
if len(ngrams1[root]['children']) == 0:
ngrams1[root]['children'] = []
merged[root] = ngrams1[root]
file1['NgramsTerms']['data'] = merged
file1['Authors']['data'] = {}
file1['Institutes']['data'] = {}
file1['Sources']['data'] = {}
tmp = file1.to_json(orient='columns',indent=4)
tmp = file1.to_json(orient='columns', indent=4)
return tmp
st.subheader(st.session_state.general_text_dict['title'])
st.write(st.session_state.general_text_dict['text'])
col1, col2 = st.columns(2)
with col1:
file1 = st.file_uploader(st.session_state.general_text_dict['file'],type=["json"],key='file1')
file1 = st.file_uploader(st.session_state.general_text_dict['file'], type=[
"json"], key='file1')
with col2:
file2 = st.file_uploader(st.session_state.general_text_dict['file'],type=["json"],key='file2')
file2 = st.file_uploader(st.session_state.general_text_dict['file'], type=[
"json"], key='file2')
if (file1 and file2):
try:
df1 = pd.read_json(file1)
df2 = pd.read_json(file2)
st.write(st.session_state.general_text_dict['new_file'])
time = datetime.strftime(datetime.now(),"%d-%m-%Y/%H:%M:%S")
name = 'output-' +time+ '.json'
st.download_button('Download File', tmp(df1, df2), name)
time = datetime.strftime(datetime.now(), "%d-%m-%Y/%H:%M:%S")
name = 'output-' + time + '.json'
st.download_button(
st.session_state.general_text_dict['new_file'], tmp(df1, df2), name)
except Exception as e:
st.write("Error : one of the file isn't valid")
......
......@@ -97,9 +97,9 @@ def detectMultiplePdfLanguages():
languages = []
for l in st.session_state.pdfLanguages.values():
if l not in languages and len(languages) == 1:
st.error(st.session_state.general_text_dict['globalWarning'])
st.error(str(st.session_state.pdfLanguages))
st.error(st.session_state.general_text_dict['advice'])
st.info(st.session_state.general_text_dict['globalWarning'])
st.info(str(st.session_state.pdfLanguages))
st.info(st.session_state.general_text_dict['advice'])
return
if len(languages) == 0:
languages.append(l)
......@@ -290,7 +290,7 @@ if st.session_state.page == 3:
"/PDFCompilation", 'zip', st.session_state.pdfDir.name)
with open(st.session_state.zipDir.name + "/PDFCompilation.zip", 'rb') as zip:
if st.session_state.warning != "":
st.error(st.session_state.warning)
st.info(st.session_state.warning)
detectMultiplePdfLanguages()
st.write(st.session_state.general_text_dict['new_file'])
st.download_button("PDFCompilation.zip",
......
......@@ -114,12 +114,14 @@ def getContent(file, data, total, separator):
reader = csv.DictReader(codecs.iterdecode(
file, 'utf-8'), delimiter=separator)
count = 1
bar = st.progress(0, "Translation progress : 0%")
bar = st.progress(0,
st.session_state.general_text_dict['loading'] + "0%")
for row in reader:
tmp = ""
first = True
loading = int(count / total * 100)
bar.progress(loading, "Translation progress : " + str(loading) + "%")
bar.progress(loading,
st.session_state.general_text_dict['loading'] + str(loading) + "%")
for name, value in row.items():
if not first:
tmp += "\t"
......@@ -207,9 +209,15 @@ def askTranslateLanguages(file):
if st.session_state.page == 0:
if st.session_state.detect:
if st.session_state.file != None:
st.session_state.separator = getSeparator(st.session_state.file)
with st.spinner(st.session_state.general_text_dict['loadingLanguages']):
st.session_state.separator = getSeparator(
st.session_state.file)
st.session_state.languages = inspectLanguages(
st.session_state.file)
if len(st.session_state.languages) == 1:
st.session_state.page = 3
st.session_state.detect = False
else:
st.session_state.page = 1
st.session_state.detect = False
st.session_state.tmpFile = st.session_state.file
......@@ -220,7 +228,7 @@ if st.session_state.page == 0:
if st.session_state.page == 1:
if st.session_state.submit:
if st.session_state.submit and st.session_state.srcLang != st.session_state.destLang:
st.session_state.page = 2
st.session_state.submit = False
else:
......@@ -232,3 +240,10 @@ if st.session_state.page == 2:
name = st.session_state.tmpFile.name
st.download_button(name,
tsv, name, on_click=resetPage())
if st.session_state.page == 3:
st.write(
st.session_state.general_text_dict['sameLanguages'] + list(st.session_state.languages.keys())[0])
st.session_state.languages = {}
st.button(
st.session_state.general_text_dict['anotherFile'], on_click=resetPage())
......@@ -197,5 +197,5 @@ if st.session_state.page == 1:
st.write(st.session_state.general_text_dict['new_file'])
st.session_state.submit = False
if st.session_state.warning != "":
st.error(st.session_state.warning)
st.info(st.session_state.warning)
st.download_button(name, txt, name, on_click=setPage())
......@@ -28,7 +28,6 @@ def ytbSearch(search, n):
result = videosSearch.result()["result"]
videos = []
while len(videos) < n:
print(len(videos))
for video in result:
id = video["id"]
title = video["title"]
......@@ -168,17 +167,19 @@ def correctTranscript(transcript):
def transcriptToTsv(search, nbVideos):
tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
dict = st.session_state.general_text_dict
with st.spinner(dict['loadingID']):
if st.session_state.manualOnly:
videos = ytbSearch(search, nbVideos * 20)
else:
videos = ytbSearch(search, nbVideos * 4)
count = 0
countManual = 0
bar = st.progress(count / nbVideos, "Search videos : " +
str(count) + " out of " + str(nbVideos))
bar = st.progress(count / nbVideos, dict['loading'] +
str(count) + dict['quantity'] + str(nbVideos))
for video in videos:
bar.progress(count / nbVideos, "Search videos : " +
str(count) + " out of " + str(nbVideos))
bar.progress(count / nbVideos, dict['loading'] +
str(count) + dict['quantity'] + str(nbVideos))
if count == nbVideos:
break
id, author, title = video[0], video[1], video[2]
......@@ -237,6 +238,7 @@ def askVideos():
if st.session_state.page == 0:
st.write(st.session_state.general_text_dict['text'])
if st.session_state.submit:
st.session_state.submit = False
if st.session_state.keywords != "":
......
......@@ -29,9 +29,9 @@ def getAllItems(id):
return 0
def loadApiItems(id, page):
url = 'https://api.zotero.org/users/'+ str(id) +'/items/top?limit=' + str(limit) + '&start=' + str(page * limit) + '&direction=asc&sort=title'
url = 'https://api.zotero.org/users/' + str(id) + '/items/top?limit=' + str(
limit) + '&start=' + str(page * limit) + '&direction=asc&sort=title'
resp = req.get(url)
if resp.ok:
jsontxt = json.loads(resp.content)
......@@ -40,10 +40,9 @@ def loadApiItems(id, page):
return 0, 0
def loadApiCollections(id):
url = 'https://api.zotero.org/users/'+ str(id) +'/collections'
url = 'https://api.zotero.org/users/' + str(id) + '/collections'
resp = req.get(url)
if resp.ok:
jsontxt = json.loads(resp.content)
......@@ -51,9 +50,11 @@ def loadApiCollections(id):
else:
return 0
def loadApiItemsByCollections(id, key):
url = 'https://api.zotero.org/users/'+ str(id) +'/collections/' + str(key) + '/items/top'
url = 'https://api.zotero.org/users/' + \
str(id) + '/collections/' + str(key) + '/items/top'
resp = req.get(url)
if resp.ok:
jsontxt = json.loads(resp.content)
......@@ -62,7 +63,6 @@ def loadApiItemsByCollections(id, key):
return 0
def getAllCollections(docs):
output = {}
for doc in docs:
......@@ -104,15 +104,17 @@ def getParamFromDoc(doc):
else:
pdate = str(date.today().year) + '\t1\t1'
abstract = abstract.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '').replace('\n', '')
title = title.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '').replace('\n', '')
source = source.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace('\t', '').replace('"', '').replace('\n', '').replace('\n', '')
abstract = abstract.encode(encoding='UTF-8', errors='ignore').decode(
"utf-8").replace('\t', '').replace('"', '').replace('\n', '')
title = title.encode(encoding='UTF-8', errors='ignore').decode(
"utf-8").replace('\t', '').replace('"', '').replace('\n', '')
source = source.encode(encoding='UTF-8', errors='ignore').decode(
"utf-8").replace('\t', '').replace('"', '').replace('\n', '').replace('\n', '')
# Output
return str(title) + "\t" + source + "\t" + str(pdate) + "\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"
def createTSV(docs):
# Output text
output = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
......@@ -123,6 +125,7 @@ def createTSV(docs):
return output
def createTSVfromCollections():
# Output text
output = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
......@@ -141,6 +144,7 @@ if 'stage' not in st.session_state:
st.session_state.id = ''
st.session_state.format = 'items'
def set_stage(stage):
st.session_state.stage = stage
st.session_state.id = st.session_state.idForm
......@@ -152,11 +156,14 @@ def set_stage(stage):
st.session_state.docsByKey = {}
st.session_state.select = False
def set_stage_collections(stage):
    """Move the page to ``stage`` and remember the chosen collections.

    Used as the submit callback of the collection form. Every entry picked
    in the ``collectionsForm`` multiselect is looked up in
    ``st.session_state.collections`` and the matching values are stored,
    in selection order, in ``st.session_state.collectionsKey``.

    Args:
        stage: Stage number to store in ``st.session_state.stage``.
    """
    state = st.session_state
    state.stage = stage
    # Exactly one entry per selected collection: the list is rebuilt from
    # scratch on every submit so earlier selections never leak through.
    state.collectionsKey = [
        state.collections[name] for name in state.collectionsForm
    ]
def set_stage_items(stage):
st.session_state.stage = stage
......@@ -166,13 +173,17 @@ def set_stage_items(stage):
if elem[1]:
st.session_state.result.append(elem[0])
def set_stage_minus():
    """Step the page back by one stage (callback of the "back" button)."""
    current_stage = st.session_state.stage
    st.session_state.stage = current_stage - 1
def saveKey():
    """Copy the state of the item checkboxes into ``docsByKey``.

    The item checkboxes are created with the widget key
    ``'itemskey-' + <document key>``. For each such widget found in the
    session state, the checkbox value is written to the second slot of the
    matching ``st.session_state.docsByKey`` entry (``[doc, selected]``).
    """
    prefix = 'itemskey-'
    state = st.session_state
    for widget_key in state.keys():
        if widget_key.startswith(prefix):
            # Strip only the prefix so a document key that itself contains
            # '-' is kept whole.
            doc_key = widget_key[len(prefix):]
            state.docsByKey[doc_key][1] = state[widget_key]
def fill_docs():
st.session_state.docs = []
......@@ -180,10 +191,12 @@ def fill_docs():
if st.session_state.zotero_search in elem[0]['data']['title']:
st.session_state.docs.append(elem[0])
def selectAll():
    """Toggle the "select all" flag and apply it to the listed documents.

    Flips ``st.session_state.select`` and rewrites the ``docsByKey`` entry
    of every document in ``st.session_state.docs`` as ``[doc, selected]``
    with the new flag value.
    """
    state = st.session_state
    selected = not state.select
    state.select = selected
    for doc in state.docs:
        state.docsByKey[doc['data']['key']] = [doc, selected]
def clear_docs():
......@@ -208,24 +221,22 @@ def switch_page(value):
fill_docs()
# Begin page
# Stage 0: form asking for the Zotero user ID and the kind of data to load.
if st.session_state.stage == 0:
    # Form
    form = st.form('api')
    form.write(st.session_state.general_text_dict['title'])
    lst = ['items', 'collections']
    # Each widget is created exactly once: its key ('idForm', 'formatForm')
    # must be unique on the page.
    st.session_state.id = form.text_input(
        'ID',
        st.session_state.id,
        key='idForm',
        help=st.session_state.general_text_dict['help'])
    # Pre-select the format chosen previously.
    st.session_state.format = form.selectbox(
        st.session_state.general_text_dict['data'],
        lst,
        lst.index(st.session_state.format),
        key='formatForm')
    # Submitting moves the page to stage 1 through the set_stage callback.
    form.form_submit_button(
        st.session_state.general_text_dict['submit'],
        on_click=set_stage,
        args=(1,))
#page for select items
# page for select items
if st.session_state.stage == 1 and st.session_state.format == 'items':
if (st.session_state.docsByKey == {}):
st.session_state.docs = getAllItems(st.session_state.id)
......@@ -234,36 +245,41 @@ if st.session_state.stage == 1 and st.session_state.format == 'items':
st.session_state.docsByKey[doc['data']['key']] = [doc, False]
clear_docs()
if st.session_state.docs == 0:
st.write(st.session_state.general_text_dict['denied'])
else:
st.write(st.session_state.general_text_dict['add_doc'])
st.checkbox(st.session_state.general_text_dict['select_all'],st.session_state.select, on_change=selectAll)
st.text_input(st.session_state.general_text_dict['search'], key='zotero_search', on_change=clear_docs)
st.checkbox(
st.session_state.general_text_dict['select_all'], st.session_state.select, on_change=selectAll)
st.text_input(
st.session_state.general_text_dict['search'], key='zotero_search', on_change=clear_docs)
min = st.session_state.page * sizepage
max = st.session_state.page * sizepage + sizepage
for doc in st.session_state.docs[min:max]:
st.checkbox(doc['data']['title'],st.session_state.docsByKey[doc['data']['key']][1], key='itemskey-'+doc['data']['key'])
st.checkbox(doc['data']['title'], st.session_state.docsByKey[doc['data']
['key']][1], key='itemskey-'+doc['data']['key'])
col1, col2 = st.columns(2)
with col1:
st.button(st.session_state.general_text_dict['submit'], on_click=set_stage_items, args=(2,))
st.button(
st.session_state.general_text_dict['submit'], on_click=set_stage_items, args=(2,))
with col2:
nb = int(st.session_state.nbdoc)//sizepage
if int(st.session_state.nbdoc)%sizepage == 0: # Fix the problem where the page end on the last file but there is still a page left
# Fix the problem where the page end on the last file but there is still a page left
if int(st.session_state.nbdoc) % sizepage == 0:
nb -= 1
if nb != 0:
col3, col4 = st.columns(2)
with col3:
if st.session_state.page != 0:
st.button(st.session_state.general_text_dict['p_page'], on_click=switch_page, args=('down',))
st.button(
st.session_state.general_text_dict['p_page'], on_click=switch_page, args=('down',))
with col4:
if st.session_state.page != nb:
st.button(st.session_state.general_text_dict['n_page'], on_click=switch_page, args=('up',))
st.button(
st.session_state.general_text_dict['n_page'], on_click=switch_page, args=('up',))
#page for select collections
# page for select collections
if st.session_state.stage == 1 and st.session_state.format == 'collections':
docs = loadApiCollections(st.session_state.id)
......@@ -274,22 +290,26 @@ if st.session_state.stage == 1 and st.session_state.format == 'collections':
st.session_state.collections = collections
form = st.form('collection')
form.write(st.session_state.general_text_dict['add_collect'])
form.multiselect(st.session_state.general_text_dict['chose_collect'], collections.keys(), key='collectionsForm')
form.form_submit_button(st.session_state.general_text_dict['submit'], on_click=set_stage_collections, args=(2,))
form.multiselect(st.session_state.general_text_dict['chose_collect'], collections.keys(
), key='collectionsForm')
form.form_submit_button(
st.session_state.general_text_dict['submit'], on_click=set_stage_collections, args=(2,))
# page for TSV items
# Stage 2 (items): report how many documents were selected and offer the
# generated TSV for download.
if st.session_state.stage == 2 and st.session_state.format == 'items':
    st.write(
        st.session_state.general_text_dict['fileTSV1']
        + str(len(st.session_state.result))
        + st.session_state.general_text_dict['fileTSV2'])
    output = createTSV(st.session_state.result)
    st.download_button('Download TSV', output, 'output.csv')
# page for TSV collections
# Stage 2 (collections): build the TSV from the chosen collections, report
# the number of documents and offer the file for download.
if st.session_state.stage == 2 and st.session_state.format == 'collections':
    output = createTSVfromCollections()
    # Splitting on '\n' yields the header line plus an empty string after
    # the final newline, hence the -2.
    # NOTE(review): assumes one line per document in the output - confirm.
    st.write(
        st.session_state.general_text_dict['fileTSV1']
        + str(len(output.split('\n')) - 2)
        + st.session_state.general_text_dict['fileTSV2'])
    st.download_button('Download TSV', output, 'output.csv')
# Every stage after the first one gets a single "back" button that steps
# the page back by one stage.
if st.session_state.stage > 0:
    st.button(
        st.session_state.general_text_dict['back'],
        on_click=set_stage_minus)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment