Commit fffc1c1b authored by Nicolas Atrax

Update pages and add corrections for CleanCSVtoTSV and PDFtoTSV

parent a8323c73
...@@ -8,7 +8,7 @@ en,text,"Convert a CSV Harzing file into a TSV file compatible with GarganText" ...@@ -8,7 +8,7 @@ en,text,"Convert a CSV Harzing file into a TSV file compatible with GarganText"
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :" fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file :" en,new_file,"Download your TSV file :"
fr,submit," Soumettre " fr,submit," Soumettre "
......
...@@ -8,7 +8,7 @@ en,text,"Inspect a CSV file to check if it is compatible with GarganText." ...@@ -8,7 +8,7 @@ en,text,"Inspect a CSV file to check if it is compatible with GarganText."
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :" fr,new_file,"Téléchargez votre fichier TSV : "
en,new_file,"Download your TSV file : " en,new_file,"Download your TSV file : "
fr,error,"Erreur : le fichier n'est pas compatible avec GarganText" fr,error,"Erreur : le fichier n'est pas compatible avec GarganText"
......
...@@ -8,7 +8,7 @@ en,text,"Transform a Json corpus from GarganText to a TSV file for GarganText" ...@@ -8,7 +8,7 @@ en,text,"Transform a Json corpus from GarganText to a TSV file for GarganText"
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :" fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file:" en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide" fr,error,"Erreur : le fichier n'est pas valide"
......
...@@ -17,19 +17,19 @@ en,submit,"Submit" ...@@ -17,19 +17,19 @@ en,submit,"Submit"
fr,load_api,"Chargement de l'api..." fr,load_api,"Chargement de l'api..."
en,load_api,"Loading API..." en,load_api,"Loading API..."
fr,overload_api,"L'API est surchargé, relancer la requête dans quelques secondes" fr,overload_api,"L'API est surchargé, relancez la requête dans quelques secondes."
en,overload'api,"The API is overloaded, please retry the request in a few seconds" en,overload'api,"The API is overloaded, please retry the request in a few seconds."
fr,nb_doc,"Nombres de documents : " fr,nb_doc,"Nombres de documents : "
en,nb_doc,"Numbers of documents : " en,nb_doc,"Numbers of documents : "
fr,perform1,"Pour des raisons de performence, on limit à " fr,perform1,"Pour des raisons de performance, on limite à "
fr,perform2," le nombre de document maximum" fr,perform2," le nombre maximum de documents."
en,perform1,"For performance reasons, we limit to " en,perform1,"For performance reasons, we limit to "
en,perform2," the maximum number of documents" en,perform2," the maximum number of documents."
fr,nb_taken,"Nombres de documents à prendre" fr,nb_taken,"Nombres de documents à prendre"
en,nb_taken,"Number of documents to take" en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelques minutes)" fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque minutes) ..."
en,createTSV,"Creation of the TSV file (It may take a while)" en,createTSV,"Creation of the TSV file (It may take a while) ..."
...@@ -2,7 +2,7 @@ locale,key,value ...@@ -2,7 +2,7 @@ locale,key,value
fr,title,"# Isidore vers GarganText" fr,title,"# Isidore vers GarganText"
en,title,"# Isidore To GarganText" en,title,"# Isidore To GarganText"
fr,text,"Effectue une recherche Isidore de documents scientifiques et les convertit en un fichier TSV." fr,text,"Effectue une recherche Isidore de documents scientifiques et les convertir en un fichier TSV."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file." en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
fr,keyword,"Mots clés" fr,keyword,"Mots clés"
...@@ -17,24 +17,24 @@ en,submit,"Submit" ...@@ -17,24 +17,24 @@ en,submit,"Submit"
fr,load_api,"Chargement de l'api..." fr,load_api,"Chargement de l'api..."
en,load_api,"Loading API..." en,load_api,"Loading API..."
fr,overload_api,"L'API est surchargé, relancer la requête dans quelques secondes." fr,overload_api,"L'API est surchargée, relancez la requête dans quelques secondes"
en,overload'api,"The API is overloaded, please retry the request in a few seconds." en,overload'api,"The API is overloaded, please retry the request in a few seconds"
fr,nb_doc,"Nombres de documents : " fr,nb_doc,"Nombres de documents : "
en,nb_doc,"Numbers of documents : " en,nb_doc,"Numbers of documents : "
fr,perform1,"Pour des raisons de performence, on limite à " fr,perform1,"Pour des raisons de performances, on limite à "
fr,perform2," le nombre maximum de documents." fr,perform2," le nombre de documents maximums"
en,perform1,"For performance reasons, we limit to " en,perform1,"For performance reasons, we limit to "
en,perform2," ,the maximum number of documents." en,perform2," the maximum number of documents"
fr,nb_taken,"Nombres de documents à prendre" fr,nb_taken,"Nombres de documents à prendre"
en,nb_taken,"Number of documents to take" en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelques minutes)" fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque minutes)"
en,createTSV,"Creation of the TSV file (It may take a while)" en,createTSV,"Creation of the TSV file (It may take a while)"
fr,doc_abstract1,"Il y a " fr,doc_abstract1,"Il y a "
fr,doc_abstract2," documents qui peuvent ne pas avoir de description." fr,doc_abstract2," documents qui peuvent ne pas avoir de descriptions."
en,doc_abstract1,"There are " en,doc_abstract1,"There are "
en,doc_abstract2," documents who may not have an abstract" en,doc_abstract2," documents who may not have an abstract"
\ No newline at end of file
...@@ -9,5 +9,5 @@ en,text,"Input 2 term files from GarganText." ...@@ -9,5 +9,5 @@ en,text,"Input 2 term files from GarganText."
fr,file," Choisir un fichier " fr,file," Choisir un fichier "
en,file," Choose a file " en,file," Choose a file "
fr,new_file," Télécharge ton fichier fusionné " fr,new_file," Téléchargez la fusion de vos fichiers "
en,new_file," Download your merge file " en,new_file," Download your merge file "
\ No newline at end of file
...@@ -8,7 +8,7 @@ en,text,"Convert a PDF file into a TXT file" ...@@ -8,7 +8,7 @@ en,text,"Convert a PDF file into a TXT file"
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TXT :" fr,new_file,"Téléchargez votre fichier TXT :"
en,new_file,"Download your TXT file: " en,new_file,"Download your TXT file: "
fr,watermark,"Filigrane : " fr,watermark,"Filigrane : "
......
...@@ -8,8 +8,8 @@ en,text,"Transform a pubmed corpus to a TSV file for GarganText" ...@@ -8,8 +8,8 @@ en,text,"Transform a pubmed corpus to a TSV file for GarganText"
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :" fr,new_file,"Télécharger votre fichier TSV :"
en,new_file,"Download your TSV file:" en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide" fr,error,"Erreur : le fichier n'est pas valide !"
en,error,"Error : the file isn't valid" en,error,"Error : the file isn't valid !"
\ No newline at end of file \ No newline at end of file
...@@ -8,7 +8,7 @@ en,text,"Transform a RIS corpus to a TSV file for GarganText" ...@@ -8,7 +8,7 @@ en,text,"Transform a RIS corpus to a TSV file for GarganText"
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :" fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file:" en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide" fr,error,"Erreur : le fichier n'est pas valide"
......
...@@ -14,7 +14,7 @@ en,text3,"You can choose the title and the author(s) for this TXT." ...@@ -14,7 +14,7 @@ en,text3,"You can choose the title and the author(s) for this TXT."
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :" fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file : " en,new_file,"Download your TSV file : "
fr,author,"Auteur(s) : " fr,author,"Auteur(s) : "
......
...@@ -11,7 +11,7 @@ en,text2,"This tool detect automatically the languages of the PDF and translate ...@@ -11,7 +11,7 @@ en,text2,"This tool detect automatically the languages of the PDF and translate
fr,file,"Choisir un fichier" fr,file,"Choisir un fichier"
en,file,"Choose a file" en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV traduit:" fr,new_file,"Téléchargez votre fichier TSV traduit:"
en,new_file,"Download your translated TSV file : " en,new_file,"Download your translated TSV file : "
fr,submit," Soumettre " fr,submit," Soumettre "
......
...@@ -35,5 +35,5 @@ en,loading,"Videos processing : " ...@@ -35,5 +35,5 @@ en,loading,"Videos processing : "
fr,quantity," sur " fr,quantity," sur "
en,quantity," out of " en,quantity," out of "
fr,new_file,"Télécharge ton fichier TSV :" fr,new_file,"Téléchargez votre fichier TSV :"
en,new_file,"Download your TSV file :" en,new_file,"Download your TSV file :"
...@@ -11,11 +11,11 @@ en,help,"Find your user ID here: https://www.zotero.org/settings/keys" ...@@ -11,11 +11,11 @@ en,help,"Find your user ID here: https://www.zotero.org/settings/keys"
fr,submit,"Suivant" fr,submit,"Suivant"
en,submit,"Submit" en,submit,"Submit"
fr,denied,"L'acèss au compte n'est pas publique, pour la mettre publique: https://www.zotero.org/settings/privacy" fr,denied,"L'acèss au compte n'est pas public, pour le mettre en public: https://www.zotero.org/settings/privacy"
en,denied,"Account access is not public, to make it public: https://www.zotero.org/settings/privacy" en,denied,"Account access is not public, to make it public: https://www.zotero.org/settings/privacy"
fr,add_doc,"*Ajouter les documents que vous voulez mettre dans le TSV*" fr,add_doc,"*Ajoutez les documents que vous voulez mettre dans le TSV*"
en,add_doc,"*Add the document that tou want in the TSV*" en,add_doc,"*Add the document that you want in the TSV*"
fr,select_all,"Select All" fr,select_all,"Select All"
en,select_all,"Select All" en,select_all,"Select All"
...@@ -29,11 +29,11 @@ en,p_page,"Previous Page" ...@@ -29,11 +29,11 @@ en,p_page,"Previous Page"
fr,n_page,"Page Suivante" fr,n_page,"Page Suivante"
en,n_page,"Next Page" en,n_page,"Next Page"
fr,add_collect,"**Selectionner une collection** vous pouvez en choisir plusieurs" fr,add_collect,"**Sélectionnez une collection** vous pouvez en choisir plusieurs"
en,add_collect,"**Chose a collection** you can choose multiple one" en,add_collect,"**Chose a collection** you can choose multiple one"
fr,chose_collect,"Choisie une collection" fr,chose_collect,"Choisir une collection"
en,chose_collect,"Chose a collection" en,chose_collect,"Choose a collection"
fr,fileTSV1,"Le TSV contient " fr,fileTSV1,"Le TSV contient "
fr,fileTSV2," documents" fr,fileTSV2," documents"
......
...@@ -39,54 +39,29 @@ def getSeparator(file): ...@@ -39,54 +39,29 @@ def getSeparator(file):
return '\t', False return '\t', False
def checkPublicationCase(tmp, split, success): def lowerName(name):
if split: tmp = name
if tmp[0][0].isupper() or tmp[1][0].isupper():
return False
else:
return success
if not tmp[0][0].isupper() or not tmp[1][0].isupper():
return False
return success
def checkPublication(name, registeredNames, errorMessage):
tmpName = name
if re.search('[a-zA-Z0-9]', name[0]) == None: if re.search('[a-zA-Z0-9]', name[0]) == None:
tmpName = name[1:] tmp = name[1:]
tmp = tmpName.split(' ') if len(tmp) < 9:
success = True return tmp.lower()
tmp = name.split(' ')
split = False split = False
first = "" first = ""
second = "" second = ""
if "_" in tmp[0] and len(tmp) == 1: if len(tmp) == 1 and "_" in tmp[0]:
tmp = tmp[0].split('_') tmp = tmp[0].split('_')
split = True split = True
if len(tmp) != 2: if len(tmp) != 2:
success = False return name.lower()
else: else:
success = checkPublicationCase(tmp, split, success)
first = tmp[0][0].lower() + tmp[0][1:] first = tmp[0][0].lower() + tmp[0][1:]
second = tmp[1][0].lower() + tmp[1][1:] second = tmp[1][0].lower() + tmp[1][1:]
if first != "publication" or second not in ["day", "month", "year"]: return first + "_" + second
success = False
if not success:
errorMessage += "Error at line 1 ! Wrong name : " + \
name + " is not appropriated ! \n"
else:
registeredNames.append(first + "_" + second)
return success, errorMessage
def checkNameValidity(name, columnNames, registeredNames, errorMessage): def checkNameValidity(name, columnNames, registeredNames, errorMessage):
tmpName = name if name in registeredNames:
if re.search('[a-zA-Z0-9]', name[0]) == None:
tmpName = name[1:]
if tmpName not in columnNames:
errorMessage += "Error at line 1 ! Wrong name : " + \
name + " is not appropriated ! \n"
return False, errorMessage
if tmpName in registeredNames:
errorMessage += "Error at line 1 ! Same name for 2 differents columns! \n" errorMessage += "Error at line 1 ! Same name for 2 differents columns! \n"
return False, errorMessage return False, errorMessage
return True, errorMessage return True, errorMessage
...@@ -105,23 +80,30 @@ def checkColumnExistence(registeredNames, errorMessage): ...@@ -105,23 +80,30 @@ def checkColumnExistence(registeredNames, errorMessage):
return True, errorMessage return True, errorMessage
def checkColumnNames(name, errorMessage, registeredNames, success): def checkColumnNames(name, errorMessage, registeredNames, otherColumns, success):
columnNames = ["authors", "title", "publication_year", columnNames = ["authors", "title", "source", "publication_year",
"publication_month", "publication_day", "abstract", "source"] "publication_month", "publication_day", "abstract"]
name = name.replace("\n", "") name = name.replace("\n", "")
if len(name) > 9: tmpSuccess, errorMessage = checkNameValidity(name, columnNames, registeredNames, errorMessage)
tmpSuccess, errorMessage = checkPublication( if tmpSuccess:
name, registeredNames, errorMessage) if lowerName(name) in columnNames:
else: registeredNames.append(name)
name = name.replace(" ", "") else :
tmpSuccess, errorMessage = checkNameValidity( otherColumns.append(name)
name[0].lower() + name[1:], columnNames, registeredNames, errorMessage) if success :
if tmpSuccess:
registeredNames.append(name[0].lower() + name[1:])
if success:
success = tmpSuccess success = tmpSuccess
return success, errorMessage, registeredNames return errorMessage, registeredNames, otherColumns, success
def addColumnsNamestoTSV(data, registeredNames, otherColumns):
for name in registeredNames :
if data != "":
data += "\t"
data += name
for name in otherColumns :
data += "\t"
data += name
return data
def getColumnsNames(file, separator, errorMessage): def getColumnsNames(file, separator, errorMessage):
data = "" data = ""
...@@ -130,40 +112,21 @@ def getColumnsNames(file, separator, errorMessage): ...@@ -130,40 +112,21 @@ def getColumnsNames(file, separator, errorMessage):
success = True success = True
reader = csv.DictReader(codecs.iterdecode( reader = csv.DictReader(codecs.iterdecode(
file, 'utf-8'), delimiter=separator) file, 'utf-8'), delimiter=separator)
columnsNames = [] othersColumns = []
for row in reader: for row in reader:
for name, value in row.items(): for name, value in row.items():
columnName = name.replace("\ufeff", "") columnName = name.replace("\ufeff", "")
if (columnNb < 7): errorMessage, registeredNames, otherColumns, success = checkColumnNames(
success, errorMessage, registeredNames = checkColumnNames( name, errorMessage, registeredNames, othersColumns, success)
name, errorMessage, registeredNames, success)
if data != "":
data += "\t"
data += columnName
columnNb += 1
success, errorMessage = checkColumnExistence( success, errorMessage = checkColumnExistence(
registeredNames, errorMessage) registeredNames, errorMessage)
if success :
data = addColumnsNamestoTSV(data, registeredNames, otherColumns)
break break
data += "\n" data += "\n"
return data, success, errorMessage return data, success, errorMessage
def lowerName(name):
tmp = name.split(' ')
split = False
first = ""
second = ""
if len(tmp) == 1 and "_" in tmp[0]:
tmp = tmp[0].split('_')
split = True
if len(tmp) != 2:
return name.lower()
else:
first = tmp[0][0].lower() + tmp[0][1:]
second = tmp[1][0].lower() + tmp[1][1:]
return first + "_" + second
def checkDate(name, value, success, fill, csvLine, errorMessage): def checkDate(name, value, success, fill, csvLine, errorMessage):
if name in ["publication_year", "publication_month", "publication_day"]: if name in ["publication_year", "publication_month", "publication_day"]:
if value == "" or value == "\n": if value == "" or value == "\n":
...@@ -210,43 +173,45 @@ def correctedSequence(text): ...@@ -210,43 +173,45 @@ def correctedSequence(text):
tmp = "\"" + tmp + "\"" tmp = "\"" + tmp + "\""
return tmp return tmp
def getContent(file, separator, data, success, fill, errorMessage): def getContent(file, separator, data, success, fill, errorMessage):
reader = csv.DictReader(codecs.iterdecode( columnNames = ["authors", "title", "source", "publication_year",
file, 'utf-8'), delimiter=separator) "publication_month", "publication_day", "abstract"]
csvLine = 2 csvLine = 2
columnNb = 0 reader = csv.DictReader(codecs.iterdecode(file, 'utf-8'), delimiter=separator)
for row in reader: for row in reader:
tmp = ""
first = True first = True
tsv1 = ""
tsv2 = ""
for name, value in row.items(): for name, value in row.items():
tmpFill = "" tmpFill = ""
if not first: if lowerName(name) in columnNames:
tmp += "\t" if not first :
else: tsv1 += "\t"
first = False
if (columnNb < 7):
success, tmpFill, errorMessage = checkMissing( success, tmpFill, errorMessage = checkMissing(
lowerName(name), value, success, fill, csvLine, errorMessage) lowerName(name), value, success, fill, csvLine, errorMessage)
if tmpFill != "": if tmpFill != "":
tmp += tmpFill tsv1 += tmpFill
else: else:
success, tmpFill, errorMessage = checkDate( success, tmpFill, errorMessage = checkDate(
lowerName(name), value, success, fill, csvLine, errorMessage) lowerName(name), value, success, fill, csvLine, errorMessage)
tmp += correctedSequence(value) tsv1 += correctedSequence(value)
else: else :
tmp += correctedSequence(value) success, tmpFill, errorMessage = checkMissing(
columnNb += 1 lowerName(name), value, success, fill, csvLine, errorMessage)
columnNb = 0 if tmpFill != "":
tsv2 +="\t" + tmpFill
else:
tsv2 += "\t" + correctedSequence(value)
if first:
first = False
csvLine += 1 csvLine += 1
data += tmp + "\n" data += tsv1 + tsv2 + "\n"
return data[:-1], success, errorMessage return data[:-1], success, errorMessage
# Code End # Code End
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
st.session_state.fill = st.checkbox(st.session_state.general_text_dict['fill']) st.session_state.fill = st.checkbox(value = True, label = st.session_state.general_text_dict['fill'])
file = st.file_uploader( file = st.file_uploader(
st.session_state.general_text_dict['file'], type=["tsv", "csv"], key='file') st.session_state.general_text_dict['file'], type=["tsv", "csv"], key='file')
......
...@@ -190,7 +190,5 @@ if st.session_state.stage_isidore > 1: ...@@ -190,7 +190,5 @@ if st.session_state.stage_isidore > 1:
print(st.session_state.nb_wanted) print(st.session_state.nb_wanted)
st.session_state.output = create_output( st.session_state.output = create_output(
st.session_state.search, lang[st.session_state.language], st.session_state.nb_wanted) st.session_state.search, lang[st.session_state.language], st.session_state.nb_wanted)
st.download_button('Download TSV', st.session_state.output, 'output.csv')
fileName = "HALOutput_" + str(datetime.now().strftime("%Y-%m-%d_%H:%M:%S")) + '.csv'
st.download_button('Download TSV', st.session_state.output, fileName)
...@@ -7,7 +7,6 @@ import streamlit as st ...@@ -7,7 +7,6 @@ import streamlit as st
import requests as req import requests as req
import json import json
import time import time
from datetime import datetime
from json import JSONDecodeError from json import JSONDecodeError
import src.basic as tmp import src.basic as tmp
...@@ -65,16 +64,11 @@ def create_output(search, language, nb_doc): ...@@ -65,16 +64,11 @@ def create_output(search, language, nb_doc):
break break
time.sleep(retryTime) time.sleep(retryTime)
print('Retry') print('Retry')
tmp, nb_tmp = createFile(txt, numberReplies, language) tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language)
output += tmp output += tmp
nb += nb_tmp nb += nb_tmp
if nb_doc % numberReplies != 0: if nb_doc % numberReplies != 0:
while (True): txt = loadApiIsidorePage(search, language, nb_doc//numberReplies + 1)
txt = loadApiIsidorePage(search, language, nb_doc//numberReplies + 1)
if txt != 0:
break
time.sleep(retryTime)
print('Retry')
tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language) tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language)
output += tmp output += tmp
nb += nb_tmp nb += nb_tmp
...@@ -145,16 +139,12 @@ def createFile(docs, limit, language): ...@@ -145,16 +139,12 @@ def createFile(docs, limit, language):
else: else:
abstract = tmp abstract = tmp
else: else:
if '$' in abstract.keys(): abstract = abstract['$']
abstract = abstract['$']
else:
abstract = ''
if 'types' in doc['isidore'].keys(): if 'types' in doc['isidore'].keys():
print(i) if type(doc['isidore']['types']['type'] == str) and doc['isidore']['types']['type'] in ['Books', 'text']:
if type(doc['isidore']['types']['type']) == str and doc['isidore']['types']['type'] in ['Books', 'text']:
nb += 1 nb += 1
elif type(doc['isidore']['types']['type']) == dict and doc['isidore']['types']['type']['$'] in ['Books', 'text']: elif type(doc['isidore']['types']['type'] == dict) and doc['isidore']['types']['type'][1] in ['Books', 'text']:
nb += 1 nb += 1
else: else:
print(title) print(title)
...@@ -290,5 +280,4 @@ if st.session_state.stage_isidore > 1: ...@@ -290,5 +280,4 @@ if st.session_state.stage_isidore > 1:
st.write(st.session_state.general_text_dict['doc_abstract1'] + str( st.write(st.session_state.general_text_dict['doc_abstract1'] + str(
st.session_state.nb_bad_file) + st.session_state.general_text_dict['doc_abstract2']) st.session_state.nb_bad_file) + st.session_state.general_text_dict['doc_abstract2'])
fileName = "isidoreOutput_" + str(datetime.now().strftime("%Y-%m-%d_%H:%M:%S")) + '.csv' st.download_button('Download TSV', st.session_state.output, 'output.csv')
st.download_button('Download TSV', st.session_state.output, fileName)
...@@ -5,7 +5,7 @@ Loïc Chapron ...@@ -5,7 +5,7 @@ Loïc Chapron
import json import json
import pandas as pd import pandas as pd
from datetime import datetime import datetime
import zipfile import zipfile
import streamlit as st import streamlit as st
import src.basic as tmp import src.basic as tmp
...@@ -60,6 +60,8 @@ def read_zip(zip_file): ...@@ -60,6 +60,8 @@ def read_zip(zip_file):
temp["publication_year"] = article["publicationDate"][0] temp["publication_year"] = article["publicationDate"][0]
except: except:
temp["publication_year"] = datetime.date.today().year temp["publication_year"] = datetime.date.today().year
temp["publication_year"] = article.get(
"publicationDate", datetime.date.today().year)[0]
temp["publication_month"] = 1 temp["publication_month"] = 1
temp["publication_day"] = 1 temp["publication_day"] = 1
...@@ -89,13 +91,13 @@ file = st.file_uploader( ...@@ -89,13 +91,13 @@ file = st.file_uploader(
if file: if file:
try: try:
fileName = "istexOutput_" + str(datetime.now().strftime("%Y-%m-%d_%H:%M:%S")) + '.csv' name = file.name.split('.')[0] + '.csv'
res, nb_dup = read_zip(file) res, nb_dup = read_zip(file)
if nb_dup: if nb_dup:
st.write(st.session_state.general_text_dict['dup1'] + str( st.write(st.session_state.general_text_dict['dup1'] + str(
nb_dup) + st.session_state.general_text_dict['dup2']) nb_dup) + st.session_state.general_text_dict['dup2'])
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
st.download_button('Download TSV', res, fileName) st.download_button(name, res, name)
except Exception as e: except Exception as e:
st.write(st.session_state.general_text_dict['error']) st.write(st.session_state.general_text_dict['error'])
print(e) print(e)
......
...@@ -13,6 +13,8 @@ import re ...@@ -13,6 +13,8 @@ import re
import chardet import chardet
import pandas as pd import pandas as pd
import streamlit as st import streamlit as st
import lib.tika.tika as tika
tika.initVM()
from lib.tika.tika import parser from lib.tika.tika import parser
from lib.langdetect.langdetect import detect from lib.langdetect.langdetect import detect
from lib.langdetect.langdetect.lang_detect_exception import LangDetectException from lib.langdetect.langdetect.lang_detect_exception import LangDetectException
...@@ -136,7 +138,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day ...@@ -136,7 +138,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
count = 1 count = 1
languages = {} languages = {}
while n < nbLines - 2: while n < nbLines - 2:
doc = "\n".join(abstract[n: n + 9]).replace("�", "") doc = "\n".join(abstract[n: n + 9]).replace("�", "").replace("", "")
title = source + " : Part " + str(count) title = source + " : Part " + str(count)
tsv += correctedSequence(author, False) + "\t" + correctedSequence( tsv += correctedSequence(author, False) + "\t" + correctedSequence(
source, False) + "\t" + year + "\t" + month + "\t" + day + "\t" source, False) + "\t" + year + "\t" + month + "\t" + day + "\t"
......
...@@ -6,7 +6,7 @@ Loïc Chapron ...@@ -6,7 +6,7 @@ Loïc Chapron
import streamlit as st import streamlit as st
import requests as req import requests as req
import json import json
from datetime import date, datetime from datetime import date
import src.basic as tmp import src.basic as tmp
...@@ -308,8 +308,7 @@ if st.session_state.stage == 2 and st.session_state.format == 'collections': ...@@ -308,8 +308,7 @@ if st.session_state.stage == 2 and st.session_state.format == 'collections':
output = createTSVfromCollections() output = createTSVfromCollections()
st.write(st.session_state.general_text_dict['fileTSV1'] + str( st.write(st.session_state.general_text_dict['fileTSV1'] + str(
len(output.split('\n'))-2) + st.session_state.general_text_dict['fileTSV2']) len(output.split('\n'))-2) + st.session_state.general_text_dict['fileTSV2'])
fileName = "zoteroOutput_" + str(datetime.now().strftime("%Y-%m-%d_%H:%M:%S")) + '.csv' st.download_button('Download TSV', output, 'output.csv')
st.download_button('Download TSV', output, fileName)
if st.session_state.stage > 0: if st.session_state.stage > 0:
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment