Commit cf004a85 authored by Atrax Nicolas

Update pages

parent 879a0266
@@ -10,48 +10,47 @@ import src.basic as tmp
tmp.base("GarganTextJsonToTSV")


def getText(corpusJson):
    output = "title\tsource\tpublication_year\tpublication_month\tpublication_day\tabstract\tauthors\tweight\n"
    for row in corpusJson['corpus']:
        doc = row['document']['hyperdata']
        abstract = "empty"
        authors = "empty"
        title = "empty"
        source = "empty"
        if 'title' in doc.keys():
            title = doc['title'].replace('"', '').replace('\t', '')
        if 'source' in doc.keys():
            source = doc['source'].replace('"', '').replace('\t', '')
        if 'abstract' in doc.keys():
            abstract = doc['abstract'].replace('"', '').replace('\t', '')
        if 'authors' in doc.keys():
            authors = doc['authors']
        output += title + "\t" + source + "\t" + str(doc['publication_year']) + "\t" + str(
            doc['publication_month']) + "\t" + str(doc['publication_day']) + "\t" + abstract + "\t" + authors + "\t" + str(1) + "\n"
    return output


st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(
    st.session_state.general_text_dict['file'], type=["json"], key='file')
if file:
    try:
        name = file.name.split('.')[0] + '.csv'
        df = pd.read_json(file)
        st.write(st.session_state.general_text_dict['new_file'])
        st.download_button(name, getText(df), name)
    except Exception as e:
        st.write(st.session_state.general_text_dict['error'])
    file.close()
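A rough sketch of the corpus JSON shape that getText expects; the field values below are made up, and the real GarganText export may carry additional hyperdata fields:

# Hypothetical miniature input, assuming the layout used above: a top-level
# 'corpus' list whose rows nest document -> hyperdata.
sample = {
    "corpus": [
        {"document": {"hyperdata": {
            "title": "A sample title",
            "source": "A sample journal",
            "abstract": "A short abstract",
            "authors": "Doe, J.",
            "publication_year": 2023,
            "publication_month": 1,
            "publication_day": 1,
        }}}
    ]
}
# getText(sample) returns the TSV header line followed by one tab-separated
# row ending in the constant weight column "1".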
@@ -217,7 +217,6 @@ with col1:
with col2:
    st.image('img/gargantext_logo.jpg')

# Form
form = st.form('api')
@@ -15,63 +15,66 @@ tmp.base("IstexToGarganText")
def read_zip(zip_file):
    output = []
    dup = 0
    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        for file in zip_ref.namelist():
            if file.split('.')[1] != 'json' or file.split('.')[0] == 'manifest':
                continue
            try:
                with zip_ref.open(file) as f:
                    data = json.load(f)
                    article = pd.json_normalize(data)
                    f.close()
                temp = {}
                temp["title"] = article.get("title", '')[0].encode(
                    encoding='UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
                try:
                    temp["abstract"] = article.get("abstract", "")[0].encode(
                        encoding='UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
                except Exception as e:
                    temp["abstract"] = ''
                try:
                    authors = ""
                    for author in article["author"][0]:
                        authors += author["name"] + "; "
                    authors = authors[:-2]
                except:
                    authors = ''  # leave the authors column empty when the author list cannot be read
                temp["code"] = article.get("_id")[0]
                temp["authors"] = authors.encode(
                    encoding='UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
                try:
                    temp["source"] = article.get('host.title')[0].encode(
                        encoding='UTF-8', errors='ignore').decode("utf-8").replace("\t", " ")
                except:
                    temp["source"] = ''
                # default to the current year (in a one-element list so [0] still works) when publicationDate is missing
                temp["publication_year"] = article.get(
                    "publicationDate", [datetime.date.today().year])[0]
                temp["publication_month"] = 1
                temp["publication_day"] = 1
                output.append(temp)
            except Exception as e:
                dup += 1
    zip_ref.close()
    output = pd.DataFrame(output)
    duplicated = output['title'].str.lower().replace(
        ",", "", regex=True).duplicated()
    if (duplicated.any()):
        dup += duplicated.sum()
@@ -80,18 +83,22 @@ def read_zip(zip_file):
    df = pd.DataFrame(output)
    return df.to_csv(index=False, sep='\t'), dup


st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(
    st.session_state.general_text_dict['file'], type=["zip"], key='file')
if file:
    try:
        name = file.name.split('.')[0] + '.csv'
        res, nb_dup = read_zip(file)
        if nb_dup:
            st.write(st.session_state.general_text_dict['dup1'] + str(
                nb_dup) + st.session_state.general_text_dict['dup2'])
        st.write(st.session_state.general_text_dict['new_file'])
        st.download_button(name, res, name)
    except Exception as e:
        st.write(st.session_state.general_text_dict['error'])
    file.close()
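For illustration only (the file names and field values below are hypothetical), read_zip expects an Istex export archive with one JSON record per article plus a manifest it skips; a minimal in-memory equivalent:

import io
import json
import zipfile

# Hypothetical miniature archive mirroring the Istex export layout assumed above.
buf = io.BytesIO()
with zipfile.ZipFile(buf, 'w') as z:
    z.writestr('manifest.json', json.dumps({"total": 1}))
    z.writestr('ark123.json', json.dumps({
        "_id": "ark123",
        "title": "A sample title",
        "abstract": "A short abstract",
        "author": [{"name": "J. Doe"}],
        "host": {"title": "A sample journal"},
        "publicationDate": "2020",
    }))
buf.seek(0)
tsv, nb_dup = read_zip(buf)  # TSV text for GarganText plus the count of skipped/duplicated records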
@@ -68,6 +68,7 @@ def read_file(file):
        output += row
    return output


st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(
    st.session_state.general_text_dict['file'], type=["txt"], key='file')
@@ -57,7 +57,6 @@ def read_file(file):
    return output


st.write(st.session_state.general_text_dict['title'])
st.write(st.session_state.general_text_dict['text'])
file = st.file_uploader(
    st.session_state.general_text_dict['file'], type=["ris"], key='file')
@@ -70,5 +69,4 @@ if file:
        st.download_button(name, read_file(file), name)
    except Exception as e:
        st.write(st.session_state.general_text_dict['error'])
    file.close()
@@ -178,6 +178,7 @@ def transcriptToTsv(search, nbVideos):
    bar = st.progress(count / nbVideos, dict['loading'] +
                      str(count) + dict['quantity'] + str(nbVideos))
    for video in videos:
        print(count)
        bar.progress(count / nbVideos, dict['loading'] +
                     str(count) + dict['quantity'] + str(nbVideos))
        if count == nbVideos:
@@ -5,6 +5,17 @@ from st_pages import show_pages_from_config, add_indentation
def base(page):
    st.markdown(
        f'''
        <style>
        .reportview-container .sidebar-content {{
            padding-top: {1}rem;
        }}
        .reportview-container .main .block-container {{
            padding-top: {1}rem;
        }}
        </style>
        ''', unsafe_allow_html=True)
    show_pages_from_config()
    add_indentation()
@@ -23,8 +34,8 @@ def base(page):
        </style>
        """, unsafe_allow_html=True)

    # Load the language file
    def load_bundle(lang):
        df = pd.read_csv("lang/text_" + page + ".csv")
        df = df.query(f"locale == '{lang}'")
@@ -33,26 +44,31 @@ def base(page):
        for i in range(len(df)):
            tmp[df.key.to_list()[i]] = df.value.to_list()[i]
        return tmp

    # Reload the translations when the language selector changes
    def update_lang():
        st.session_state.general_text_dict = load_bundle(
            st.session_state.general_lang_dict[st.session_state.general_language])

    # Check whether this is the first visit to the app, or whether the last page shown was a different one
    if 'general_session_page' not in st.session_state.keys():
        st.session_state.general_lang_dict = {
            'Français': 'fr', 'English': 'en'}
        st.session_state.general_text_dict = load_bundle('fr')
        st.session_state.general_language = 'Français'
        st.session_state.general_session_page = page
    elif st.session_state.general_session_page != page:
        st.session_state.general_text_dict = load_bundle(
            st.session_state.general_lang_dict[st.session_state.general_language])
        st.session_state.general_session_page = page
        # Delete every session key that does not come from this file
        for key in st.session_state.keys():
            if 'general_' not in key:
                del st.session_state[key]

    st.write(st.session_state.general_text_dict['title'])
    # Language selector
    st.selectbox('Langue', list(st.session_state.general_lang_dict.keys()),
                 list(st.session_state.general_lang_dict.keys()).index(st.session_state.general_language),
                 key='general_language', on_change=update_lang)
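For reference, a minimal sketch of the translation table that load_bundle reads; the page name and rows below are invented, only the key/value/locale columns come from the code above:

import io
import pandas as pd

# Hypothetical content of lang/text_home.csv (the page name "home" is made up here).
csv_text = "key,value,locale\ntitle,Bienvenue,fr\ntitle,Welcome,en\n"
df = pd.read_csv(io.StringIO(csv_text)).query("locale == 'fr'")
bundle = dict(zip(df.key, df.value))  # -> {'title': 'Bienvenue'}, what load_bundle('fr') would return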