Commit cf004a85 authored by Atrax Nicolas

Update pages

parent 879a0266
...@@ -10,48 +10,47 @@ import src.basic as tmp ...@@ -10,48 +10,47 @@ import src.basic as tmp
tmp.base("GarganTextJsonToTSV") tmp.base("GarganTextJsonToTSV")
def getText(corpusJson):
output = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
def getText(corpusJson):
for row in corpusJson['corpus'] : output = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
doc = row['document']['hyperdata']
abstract = "empty"
authors = "empty"
title = "empty"
source = "empty"
if 'title' in doc.keys() : for row in corpusJson['corpus']:
title = doc['title'].replace('"','').replace('\t','') doc = row['document']['hyperdata']
abstract = "empty"
authors = "empty"
title = "empty"
source = "empty"
if 'source' in doc.keys() : if 'title' in doc.keys():
source = doc['source'].replace('"','').replace('\t','') title = doc['title'].replace('"', '').replace('\t', '')
if 'abstract' in doc.keys() : if 'source' in doc.keys():
abstract = doc['abstract'].replace('"','').replace('\t','') source = doc['source'].replace('"', '').replace('\t', '')
if 'abstract' in doc.keys():
abstract = doc['abstract'].replace('"', '').replace('\t', '')
if 'authors' in doc.keys() : if 'authors' in doc.keys():
authors = doc['authors'] authors = doc['authors']
output += title + "\t" + source + "\t" + str(doc['publication_year']) + "\t" + str(
doc['publication_month']) + "\t" + str(doc['publication_day']) + "\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"
output += title + "\t" + source + "\t" + str(doc['publication_year']) + "\t" + str(doc['publication_month']) + "\t" + str(doc['publication_day']) + "\t" + abstract + "\t" + authors + "\t" + str(1) + "\n" return output
return output
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(st.session_state.general_text_dict['file'],type=["json"],key='file') file = st.file_uploader(
st.session_state.general_text_dict['file'], type=["json"], key='file')
if file: if file:
try: try:
name = file.name.split('.')[0] + '.csv' name = file.name.split('.')[0] + '.csv'
df = pd.read_json(file) df = pd.read_json(file)
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
st.download_button(name, getText(df), name) st.download_button(name, getText(df), name)
except Exception as e: except Exception as e:
st.write(st.session_state.general_text_dict['error']) st.write(st.session_state.general_text_dict['error'])
print(e) file.close()
file.close()
...@@ -217,7 +217,6 @@ with col1: ...@@ -217,7 +217,6 @@ with col1:
with col2: with col2:
st.image('img/gargantext_logo.jpg') st.image('img/gargantext_logo.jpg')
# Form # Form
form = st.form('api') form = st.form('api')
......
...@@ -15,63 +15,66 @@ tmp.base("IstexToGarganText") ...@@ -15,63 +15,66 @@ tmp.base("IstexToGarganText")
def read_zip(zip_file): def read_zip(zip_file):
output=[] output = []
dup = 0 dup = 0
with zipfile.ZipFile(zip_file, 'r') as zip_ref: with zipfile.ZipFile(zip_file, 'r') as zip_ref:
for file in zip_ref.namelist(): for file in zip_ref.namelist():
if file.split('.')[1] != 'json' or file.split('.')[0] == 'manifest': if file.split('.')[1] != 'json' or file.split('.')[0] == 'manifest':
continue continue
try: try:
with zip_ref.open(file) as f: with zip_ref.open(file) as f:
data = json.load(f) data = json.load(f)
article=pd.json_normalize(data) article = pd.json_normalize(data)
f.close() f.close()
temp={} temp = {}
temp["title"]=article.get("title", '')[0].encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace("\t", " ") temp["title"] = article.get("title", '')[0].encode(
encoding='UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
try: try:
temp["abstract"]=article.get("abstract","")[0].encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace("\t", " ") temp["abstract"] = article.get("abstract", "")[0].encode(
encoding='UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
except Exception as e: except Exception as e:
temp["abstract"] = '' temp["abstract"] = ''
try: try:
authors="" authors = ""
for author in article["author"][0]: for author in article["author"][0]:
authors+=author["name"]+"; " authors += author["name"]+"; "
authors=authors[:-2] authors = authors[:-2]
except: except:
author = '' author = ''
temp["code"] = article.get("_id")[0] temp["code"] = article.get("_id")[0]
temp["authors"]=authors.encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace("\t", " ") temp["authors"] = authors.encode(
encoding='UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
try: try:
temp["source"]=article.get('host.title')[0].encode(encoding = 'UTF-8', errors = 'ignore').decode("utf-8").replace("\t", " ") temp["source"] = article.get('host.title')[0].encode(
encoding='UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
except: except:
temp["source"] = '' temp["source"] = ''
try: try:
temp["publication_year"]=article["publicationDate"][0] temp["publication_year"] = article["publicationDate"][0]
except: except:
temp["publication_year"]=datetime.date.today().year temp["publication_year"] = datetime.date.today().year
temp["publication_year"]=article.get("publicationDate", datetime.date.today().year)[0] temp["publication_year"] = article.get(
temp["publication_month"]=1 "publicationDate", datetime.date.today().year)[0]
temp["publication_day"]=1 temp["publication_month"] = 1
temp["publication_day"] = 1
output.append(temp) output.append(temp)
except Exception as e: except Exception as e:
dup += 1 dup += 1
zip_ref.close() zip_ref.close()
output=pd.DataFrame(output) output = pd.DataFrame(output)
duplicated = output['title'].str.lower().replace(",", "", regex=True).duplicated() duplicated = output['title'].str.lower().replace(
",", "", regex=True).duplicated()
if (duplicated.any()): if (duplicated.any()):
dup += duplicated.sum() dup += duplicated.sum()
...@@ -80,18 +83,22 @@ def read_zip(zip_file): ...@@ -80,18 +83,22 @@ def read_zip(zip_file):
df = pd.DataFrame(output) df = pd.DataFrame(output)
return df.to_csv(index=False, sep='\t'), dup return df.to_csv(index=False, sep='\t'), dup
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(st.session_state.general_text_dict['file'],type=["zip"],key='file')
file = st.file_uploader(
st.session_state.general_text_dict['file'], type=["zip"], key='file')
if file: if file:
try: try:
name = file.name.split('.')[0] + '.csv' name = file.name.split('.')[0] + '.csv'
res, nb_dup = read_zip(file) res, nb_dup = read_zip(file)
if nb_dup: if nb_dup:
st.write(st.session_state.general_text_dict['dup1'] + str(nb_dup) + st.session_state.general_text_dict['dup2']) st.write(st.session_state.general_text_dict['dup1'] + str(
nb_dup) + st.session_state.general_text_dict['dup2'])
st.write(st.session_state.general_text_dict['new_file']) st.write(st.session_state.general_text_dict['new_file'])
st.download_button(name, res, name) st.download_button(name, res, name)
except Exception as e: except Exception as e:
st.write(st.session_state.general_text_dict['error']) st.write(st.session_state.general_text_dict['error'])
print(e) print(e)
file.close() file.close()
\ No newline at end of file
...@@ -68,6 +68,7 @@ def read_file(file): ...@@ -68,6 +68,7 @@ def read_file(file):
output += row output += row
return output return output
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader( file = st.file_uploader(
st.session_state.general_text_dict['file'], type=["txt"], key='file') st.session_state.general_text_dict['file'], type=["txt"], key='file')
......
...@@ -57,7 +57,6 @@ def read_file(file): ...@@ -57,7 +57,6 @@ def read_file(file):
return output return output
st.write(st.session_state.general_text_dict['title'])
st.write(st.session_state.general_text_dict['text']) st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader( file = st.file_uploader(
st.session_state.general_text_dict['file'], type=["ris"], key='file') st.session_state.general_text_dict['file'], type=["ris"], key='file')
...@@ -70,5 +69,4 @@ if file: ...@@ -70,5 +69,4 @@ if file:
st.download_button(name, read_file(file), name) st.download_button(name, read_file(file), name)
except Exception as e: except Exception as e:
st.write(st.session_state.general_text_dict['error']) st.write(st.session_state.general_text_dict['error'])
print(e)
file.close() file.close()
...@@ -178,6 +178,7 @@ def transcriptToTsv(search, nbVideos): ...@@ -178,6 +178,7 @@ def transcriptToTsv(search, nbVideos):
bar = st.progress(count / nbVideos, dict['loading'] + bar = st.progress(count / nbVideos, dict['loading'] +
str(count) + dict['quantity'] + str(nbVideos)) str(count) + dict['quantity'] + str(nbVideos))
for video in videos: for video in videos:
print(count)
bar.progress(count / nbVideos, dict['loading'] + bar.progress(count / nbVideos, dict['loading'] +
str(count) + dict['quantity'] + str(nbVideos)) str(count) + dict['quantity'] + str(nbVideos))
if count == nbVideos: if count == nbVideos:
......
...@@ -5,6 +5,17 @@ from st_pages import show_pages_from_config, add_indentation ...@@ -5,6 +5,17 @@ from st_pages import show_pages_from_config, add_indentation
def base(page): def base(page):
st.markdown(
f'''
<style>
.reportview-container .sidebar-content {{
padding-top: {1}rem;
}}
.reportview-container .main .block-container {{
padding-top: {1}rem;
}}
</style>
''', unsafe_allow_html=True)
show_pages_from_config() show_pages_from_config()
add_indentation() add_indentation()
...@@ -23,8 +34,8 @@ def base(page): ...@@ -23,8 +34,8 @@ def base(page):
</style> </style>
""", unsafe_allow_html=True) """, unsafe_allow_html=True)
# Load the language file # Load the language file
def load_bundle(lang): def load_bundle(lang):
df = pd.read_csv("lang/text_" + page + ".csv") df = pd.read_csv("lang/text_" + page + ".csv")
df = df.query(f"locale == '{lang}'") df = df.query(f"locale == '{lang}'")
...@@ -33,26 +44,31 @@ def base(page): ...@@ -33,26 +44,31 @@ def base(page):
for i in range(len(df)): for i in range(len(df)):
tmp[df.key.to_list()[i]] = df.value.to_list()[i] tmp[df.key.to_list()[i]] = df.value.to_list()[i]
return tmp return tmp
# Load the language file # Load the language file
def update_lang(): def update_lang():
st.session_state.general_text_dict = load_bundle(st.session_state.general_lang_dict[st.session_state.general_language]) st.session_state.general_text_dict = load_bundle(
st.session_state.general_lang_dict[st.session_state.general_language])
# Test if it's first connection on page or else if the last page was this one # Test if it's first connection on page or else if the last page was this one
if 'general_session_page' not in st.session_state.keys(): if 'general_session_page' not in st.session_state.keys():
st.session_state.general_lang_dict = {'Français' : 'fr', 'English': 'en'} st.session_state.general_lang_dict = {
'Français': 'fr', 'English': 'en'}
st.session_state.general_text_dict = load_bundle('fr') st.session_state.general_text_dict = load_bundle('fr')
st.session_state.general_language = 'Français' st.session_state.general_language = 'Français'
st.session_state.general_session_page = page st.session_state.general_session_page = page
elif st.session_state.general_session_page != page: elif st.session_state.general_session_page != page:
st.session_state.general_text_dict = load_bundle(st.session_state.general_lang_dict[st.session_state.general_language]) st.session_state.general_text_dict = load_bundle(
st.session_state.general_lang_dict[st.session_state.general_language])
st.session_state.general_session_page = page st.session_state.general_session_page = page
# Delete every key who aren't fron this file # Delete every key who aren't fron this file
for key in st.session_state.keys(): for key in st.session_state.keys():
if 'general_' not in key: if 'general_' not in key:
del st.session_state[key] del st.session_state[key]
st.write(st.session_state.general_text_dict['title'])
# select the lang # select the lang
st.selectbox('Langue', list(st.session_state.general_lang_dict.keys()), list(st.session_state.general_lang_dict.keys()).index(st.session_state.general_language),key='general_language', on_change=update_lang) st.selectbox('Langue', list(st.session_state.general_lang_dict.keys()), list(st.session_state.general_lang_dict.keys(
)).index(st.session_state.general_language), key='general_language', on_change=update_lang)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment