Julien Moutinho / GarganTexternal tools / Commits

Commit 4463cc66
authored Aug 10, 2023 by Loïc Chapron
add new pages + banner
parent 65c88912
Showing 13 changed files with 630 additions and 41 deletions (+630 -41)
Streamlit/Welcome.py  +29 -3
Streamlit/lang/text_GarganTextJsonToTSV.csv  +15 -0
Streamlit/lang/text_HALToGarganText.csv  +35 -0
Streamlit/lang/text_IsidoreToGarganText.csv  +2 -2
Streamlit/lang/text_IstexToGarganText.csv  +17 -0
Streamlit/lang/text_Welcome.csv  +4 -0
Streamlit/lang/text_ZoteroToGarganText.csv  +3 -0
Streamlit/pages/GarganText_Json_To_TSV.py  +96 -0
Streamlit/pages/HAL_To_GarganText.py  +215 -0
Streamlit/pages/Isidore_To_GarganText.py  +17 -7
Streamlit/pages/Istex_To_GarganText.py  +134 -0
Streamlit/pages/Merge_Term_GarganText.py  +48 -21
Streamlit/pages/Zotero_To_GarganText.py  +15 -8
Streamlit/Welcome.py
"""
Streamlit Application
Loïc Chapron
"""
import
streamlit
as
st
import
pandas
as
pd
...
...
@@ -6,6 +11,22 @@ st.set_page_config(
)
# Load Banner image
st
.
image
(
'img/gargantool_banner.jpg'
)
# Reduce size on border
st
.
markdown
(
"""
<style>
.block-container {
padding-top: 2rem;
padding-bottom: 0rem;
padding-left: 1rem;
padding-right: 1rem;
}
</style>
"""
,
unsafe_allow_html
=
True
)
# Load the language file
def
load_bundle
(
lang
):
df
=
pd
.
read_csv
(
"lang/text_Welcome.csv"
)
df
=
df
.
query
(
f
"locale == '{lang}'"
)
...
...
@@ -15,10 +36,11 @@ def load_bundle(lang):
tmp
[
df
.
key
.
to_list
()[
i
]]
=
df
.
value
.
to_list
()[
i
]
return
tmp
# Load the language file
def
update_lang
():
print
(
st
.
session_state
.
general_language
)
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
# Test if it's first connection on page or else if the last page was this one
if
'general_session_page'
not
in
st
.
session_state
.
keys
():
st
.
session_state
.
general_lang_dict
=
{
'Français'
:
'fr'
,
'English'
:
'en'
}
st
.
session_state
.
general_text_dict
=
load_bundle
(
'fr'
)
...
...
@@ -27,12 +49,16 @@ if 'general_session_page' not in st.session_state.keys():
elif
st
.
session_state
.
general_session_page
!=
'Welcome'
:
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
st
.
session_state
.
general_session_page
=
'Welcome'
# Delete every key who aren't fron this file
for
key
in
st
.
session_state
.
keys
():
if
'general_'
not
in
key
:
del
st
.
session_state
[
key
]
# select the lang
st
.
selectbox
(
'Langue'
,
list
(
st
.
session_state
.
general_lang_dict
.
keys
()),
list
(
st
.
session_state
.
general_lang_dict
.
keys
())
.
index
(
st
.
session_state
.
general_language
),
key
=
'general_language'
,
on_change
=
update_lang
)
st
.
write
(
"# Welcome to GanganText Tools"
)
st
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
]
)
st
.
write
(
st
.
session_state
.
general_text_dict
[
'welcome'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'tools'
])
...
...
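Aside (not part of the commit): every page in this changeset repeats the same load_bundle/update_lang localization pattern, so a minimal standalone sketch of what it does may help when reading the diffs below. It assumes a locale CSV shaped like the lang/text_*.csv files in this commit; the 'en' call and the printed value are only examples.

import pandas as pd

def load_bundle(lang, path="lang/text_Welcome.csv"):
    # Keep only the rows of the requested locale ('fr' or 'en') ...
    df = pd.read_csv(path)
    df = df.query(f"locale == '{lang}'")
    # ... and turn the key/value columns into the plain dict the pages store as general_text_dict.
    return dict(zip(df.key.to_list(), df.value.to_list()))

texts = load_bundle('en')
print(texts['title'])  # e.g. "# Welcome to GanganText Tools"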
Streamlit/lang/text_GarganTextJsonToTSV.csv
0 → 100644
locale,key,value
fr,title,"# Json Vers TSV"
en,title,"# Json To TSV"
fr,text,"Transforme un corpus Json venant de Gargantext en TSV pour GarganText"
en,text,"Transform a Json corpus fron GarganText to a TSV file for GarganText"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
Streamlit/lang/text_HALToGarganText.csv
0 → 100644
locale,key,value
fr,title,"**HAL vers GarganText**"
en,title,"**HAL To GarganText**"
fr,text,"HAL est une base de document scientifique en ligne et libre d'accès contenant plus d'un million de document."
en,text,"HAL is an online and free access scientific document database containing more than a million documents"
fr,keyword,"Mots clés"
en,keyword,"Key word"
fr,lang,"Langue des textes (si possible)"
en,lang,"Text languages (if possible)"
fr,submit,"Soumettre"
en,submit,"Submit"
fr,load_api,"Chargement de l'api..."
en,load_api,"Loading API..."
fr,overload_api,"L'API est surchargé, relancer la requête dans quelques secondes"
en,overload_api,"The API is overloaded, please retry the request in a few seconds"
fr,nb_doc,"Nombres de documents : "
en,nb_doc,"Numbers of documents : "
fr,perform1,"Pour des raisons de performence, on limit à "
fr,perform2," le nombre de document maximum"
en,perform1,"For performance reasons, we limit to "
en,perform2," the maximum number of documents"
fr,nb_taken,"Nombres de documents à prendre"
en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque minutes)"
en,createTSV,"Creation of the TSV file (It may take a while)"
Streamlit/lang/text_IsidoreToGarganText.csv
...
...
@@ -5,8 +5,8 @@ en,title,"**Isidore To GarganText**"
fr,keyword,"Mots clés"
en,keyword,"Key word"
fr,lang,"Langue"
en,lang,"
Language
"
fr,lang,"Langue
des textes (si possible)
"
en,lang,"
Text languages (if possible)
"
fr,submit,"Soumettre"
en,submit,"Submit"
...
...
Streamlit/lang/text_IstexToGarganText.csv
0 → 100644
locale,key,value
fr,title,"# Istex Vers GarganText"
en,title,"# Istex To GarganText"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
fr,dup1,"Certains fichiers ("
fr,dup2,") ont été retirés pour divers raisons (fichier au mauvais format, fichier identique...)"
en,dup1,"Some file ("
en,dup2,") have been removed for various reasons (file with wrong format, file already present...)"
fr,new_file,"Télécharge ton fichier TSV :"
en,new_file,"Download your TSV file:"
fr,error,"Erreur : le fichier n'est pas valide"
en,error,"Error : the file isn't valid"
\ No newline at end of file
Streamlit/lang/text_Welcome.csv
locale,key,value
fr,title,"# Bienvenue sur GanganText Tools"
en,title,"# Welcome to GanganText Tools"
fr,welcome,"Bienvenue sur ces pages rassemblant des outils développés par des utilisateurs de GarganText pour des utilisateurs de GarganText."
en,welcome,"Welcome to these pages featuring tools developed by GarganText’ users for GarganText’ users."
...
...
Streamlit/lang/text_ZoteroToGarganText.csv
...
...
@@ -5,6 +5,9 @@ en,title,"**Zotero vers GarganText**"
fr,data,"Type de donnée"
en,data,"Type of data"
fr,help,"Trouvé votre ID d'utilisateur ici: https://www.zotero.org/settings/keys"
en,help,"Find your user ID here: https://www.zotero.org/settings/keys"
fr,submit,"Suivant"
en,submit,"Submit"
...
...
Streamlit/pages/GarganText_Json_To_TSV.py
0 → 100644
"""
Streamlit Application
Loïc Chapron
"""
import
streamlit
as
st
import
pandas
as
pd
st
.
image
(
'img/gargantool_banner.jpg'
)
st
.
markdown
(
"""
<style>
.block-container {
padding-top: 2rem;
padding-bottom: 0rem;
padding-left: 1rem;
padding-right: 1rem;
}
</style>
"""
,
unsafe_allow_html
=
True
)
def
load_bundle
(
lang
):
df
=
pd
.
read_csv
(
"lang/text_GarganTextJsonToTSV.csv"
)
df
=
df
.
query
(
f
"locale == '{lang}'"
)
tmp
=
{}
for
i
in
range
(
len
(
df
)):
tmp
[
df
.
key
.
to_list
()[
i
]]
=
df
.
value
.
to_list
()[
i
]
return
tmp
def
update_lang
():
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
if
'general_session_page'
not
in
st
.
session_state
.
keys
():
st
.
session_state
.
general_lang_dict
=
{
'Français'
:
'fr'
,
'English'
:
'en'
}
st
.
session_state
.
general_text_dict
=
load_bundle
(
'fr'
)
st
.
session_state
.
general_language
=
'Français'
st
.
session_state
.
general_session_page
=
'GarganTextJsonToTSV'
elif
st
.
session_state
.
general_session_page
!=
'GarganTextJsonToTSV'
:
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
st
.
session_state
.
general_session_page
=
'GarganTextJsonToTSV'
for
key
in
st
.
session_state
.
keys
():
if
'general_'
not
in
key
:
del
st
.
session_state
[
key
]
st
.
selectbox
(
'Langue'
,
list
(
st
.
session_state
.
general_lang_dict
.
keys
()),
list
(
st
.
session_state
.
general_lang_dict
.
keys
())
.
index
(
st
.
session_state
.
general_language
),
key
=
'general_language'
,
on_change
=
update_lang
)
def
getText
(
corpusJson
):
output
=
"title
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
abstract
\t
authors
\t
weight
\n
"
for
row
in
corpusJson
[
'corpus'
]
:
doc
=
row
[
'document'
][
'hyperdata'
]
abstract
=
"empty"
authors
=
"empty"
title
=
"empty"
source
=
"empty"
if
'title'
in
doc
.
keys
()
:
title
=
doc
[
'title'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'source'
in
doc
.
keys
()
:
source
=
doc
[
'source'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'abstract'
in
doc
.
keys
()
:
abstract
=
doc
[
'abstract'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'authors'
in
doc
.
keys
()
:
authors
=
doc
[
'authors'
]
output
+=
title
+
"
\t
"
+
source
+
"
\t
"
+
str
(
doc
[
'publication_year'
])
+
"
\t
"
+
str
(
doc
[
'publication_month'
])
+
"
\t
"
+
str
(
doc
[
'publication_day'
])
+
"
\t
"
+
abstract
+
"
\t
"
+
authors
+
"
\t
"
+
str
(
1
)
+
"
\n
"
return
output
st
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"json"
],
key
=
'file'
)
if
file
:
try
:
name
=
file
.
name
.
split
(
'.'
)[
0
]
+
'.csv'
df
=
pd
.
read_json
(
file
)
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
name
,
getText
(
df
),
name
)
except
Exception
as
e
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'error'
])
print
(
e
)
file
.
close
()
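For reference, a minimal sketch (not part of the commit) of the JSON shape getText above expects, a top-level 'corpus' list whose entries carry a document['hyperdata'] mapping, together with the call that produces the TSV text; all sample values below are made up.

corpus_json = {
    "corpus": [
        {"document": {"hyperdata": {
            "title": "Sample title",            # hypothetical document
            "source": "Sample journal",
            "publication_year": 2023,
            "publication_month": 8,
            "publication_day": 10,
            "abstract": "Sample abstract",
            "authors": "A. Author;B. Author",
        }}}
    ]
}
tsv_text = getText(corpus_json)
# tsv_text starts with the tab-separated header line
# title, source, publication_year, ..., weight, then one row per document (weight fixed to 1).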
Streamlit/pages/HAL_To_GarganText.py
0 → 100644
"""
Streamlit Application
Loïc Chapron
"""
import
requests
as
req
import
json
import
streamlit
as
st
from
datetime
import
datetime
import
pandas
as
pd
limit
=
500
limitItems
=
10000
st
.
image
(
'img/gargantool_banner.jpg'
)
st
.
markdown
(
"""
<style>
.block-container {
padding-top: 2rem;
padding-bottom: 0rem;
padding-left: 1rem;
padding-right: 1rem;
}
</style>
"""
,
unsafe_allow_html
=
True
)
def
load_bundle
(
lang
):
df
=
pd
.
read_csv
(
"lang/text_HALToGarganText.csv"
)
df
=
df
.
query
(
f
"locale == '{lang}'"
)
tmp
=
{}
for
i
in
range
(
len
(
df
)):
tmp
[
df
.
key
.
to_list
()[
i
]]
=
df
.
value
.
to_list
()[
i
]
return
tmp
def
update_lang
():
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
if
'general_session_page'
not
in
st
.
session_state
.
keys
():
st
.
session_state
.
general_lang_dict
=
{
'Français'
:
'fr'
,
'English'
:
'en'
}
st
.
session_state
.
general_text_dict
=
load_bundle
(
'fr'
)
st
.
session_state
.
general_language
=
'Français'
st
.
session_state
.
general_session_page
=
'HALToGarganText'
elif
st
.
session_state
.
general_session_page
!=
'HALToGarganText'
:
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
st
.
session_state
.
general_session_page
=
'HALToGarganText'
for
key
in
st
.
session_state
.
keys
():
if
'general_'
not
in
key
:
del
st
.
session_state
[
key
]
st
.
selectbox
(
'Langue'
,
list
(
st
.
session_state
.
general_lang_dict
.
keys
()),
list
(
st
.
session_state
.
general_lang_dict
.
keys
())
.
index
(
st
.
session_state
.
general_language
),
key
=
'general_language'
,
on_change
=
update_lang
)
def
loadApiHALNbFile
(
search
,
lang
):
url
=
'http://api.archives-ouvertes.fr/search/?q='
+
search
+
'&rows=5&fl=title_s,'
+
lang
+
'_title_s,source_s,publicationDate_s,authFullName_s,'
+
lang
+
'_abstract_s,abstract_s&fq=language_s:'
+
lang
resp
=
req
.
get
(
url
)
print
(
url
)
try
:
docs
=
json
.
loads
(
resp
.
content
)
except
Exception
as
e
:
docs
=
0
return
docs
[
'response'
][
'numFound'
]
def
loadApiHAL
(
search
,
lang
,
page
,
nbvalue
):
url
=
'http://api.archives-ouvertes.fr/search/?q='
+
search
+
'&start='
+
str
(
page
*
limit
)
+
'&rows='
+
str
(
nbvalue
)
+
'&fl=title_s,'
+
lang
+
'_title_s,source_s,publicationDate_s,authFullName_s,'
+
lang
+
'_abstract_s,abstract_s&fq=language_s:'
+
lang
resp
=
req
.
get
(
url
)
print
(
url
)
try
:
docs
=
json
.
loads
(
resp
.
content
)
except
Exception
as
e
:
docs
=
0
return
docs
def
getParamFromDoc
(
docs
):
output
=
''
for
doc
in
docs
:
if
'en_title_s'
in
doc
.
keys
():
title
=
doc
[
'en_title_s'
][
0
]
elif
'fr_title_s'
in
doc
.
keys
():
title
=
doc
[
'fr_title_s'
][
0
]
elif
'title_s'
in
doc
.
keys
():
title
=
doc
[
'title_s'
][
0
]
else
:
continue
# Authors
if
'authFullName_s'
in
doc
.
keys
():
authors
=
[]
for
author
in
doc
[
'authFullName_s'
]:
authors
.
append
(
author
)
authors
=
';'
.
join
(
authors
)
else
:
authors
=
''
# Source
if
'source_s'
in
doc
.
keys
():
source
=
doc
[
'source_s'
]
else
:
source
=
''
# Abstract
if
'en_abstract_s'
in
doc
.
keys
():
abstract
=
doc
[
'en_abstract_s'
][
0
]
elif
'fr_abstract_s'
in
doc
.
keys
():
abstract
=
doc
[
'fr_abstract_s'
][
0
]
elif
'abstract_s'
in
doc
.
keys
():
abstract
=
doc
[
'abstract_s'
][
0
]
else
:
abstract
=
''
# Date
if
'publicationDate_s'
in
doc
.
keys
():
split
=
doc
[
'publicationDate_s'
]
.
split
(
'-'
)
if
len
(
split
)
==
3
:
pdate
=
datetime
.
strptime
(
doc
[
'publicationDate_s'
],
'
%
Y-
%
m-
%
d'
)
.
strftime
(
'
%
Y
\t
%
m
\t
%
d'
)
elif
len
(
split
)
==
2
:
pdate
=
datetime
.
strptime
(
doc
[
'publicationDate_s'
],
'
%
Y-
%
m'
)
.
strftime
(
'
%
Y
\t
%
m
\t
1'
)
else
:
pdate
=
doc
[
'publicationDate_s'
]
+
'
\t
1
\t
1'
else
:
pdate
=
'1900
\t
1
\t
1'
abstract
=
abstract
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
'
\t
'
,
''
)
.
replace
(
'"'
,
''
)
.
replace
(
'
\n
'
,
''
)
title
=
title
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
'
\t
'
,
''
)
.
replace
(
'"'
,
''
)
.
replace
(
'
\n
'
,
''
)
source
=
source
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
'
\t
'
,
''
)
.
replace
(
'"'
,
''
)
.
replace
(
'
\n
'
,
''
)
.
replace
(
'
\n
'
,
''
)
# Output
output
+=
str
(
title
)
+
"
\t
"
+
source
+
"
\t
"
+
str
(
pdate
)
+
"
\t
"
+
abstract
+
"
\t
"
+
authors
+
"
\t
"
+
str
(
1
)
+
"
\n
"
return
output
def
create_output
(
search
,
lang
,
nb_value
):
output
=
"title
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
abstract
\t
authors
\t
weight
\n
"
for
i
in
range
(
0
,
nb_value
//
limit
):
response
=
loadApiHAL
(
search
,
lang
,
i
,
limit
)
output
+=
getParamFromDoc
(
response
[
'response'
][
'docs'
])
if
(
nb_value
%
limit
!=
0
):
response
=
loadApiHAL
(
search
,
lang
,
nb_value
//
limit
,
nb_value
%
limit
)
output
+=
getParamFromDoc
(
response
[
'response'
][
'docs'
])
return
output
lang
=
{
'Français'
:
'fr'
,
'Anglais'
:
'en'
,
}
if
'stage_isidore'
not
in
st
.
session_state
:
st
.
session_state
.
stage_isidore
=
0
st
.
session_state
.
nb_wanted
=
1
def
set_stage
(
stage
):
st
.
session_state
.
stage_isidore
=
stage
st
.
session_state
.
output
=
''
# txt
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
# Form
form
=
st
.
form
(
'api'
)
form
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
])
search
=
form
.
text_input
(
st
.
session_state
.
general_text_dict
[
'keyword'
])
language
=
form
.
selectbox
(
st
.
session_state
.
general_text_dict
[
'lang'
],
lang
.
keys
())
form
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
1
,))
# API and Slider
if
st
.
session_state
.
stage_isidore
>
0
:
# Only call first time and after
if
'search'
not
in
st
.
session_state
or
'language'
not
in
st
.
session_state
or
search
!=
st
.
session_state
.
search
or
language
!=
st
.
session_state
.
language
:
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'load_api'
]
):
nb_doc
=
int
(
loadApiHALNbFile
(
search
,
lang
[
language
]))
st
.
session_state
.
nb_doc
=
nb_doc
if
nb_doc
!=
0
:
st
.
session_state
.
search
=
search
st
.
session_state
.
language
=
language
if
st
.
session_state
.
nb_doc
!=
0
:
# Form with slider
form2
=
st
.
form
(
'my_form2'
)
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'nb_doc'
]
+
str
(
st
.
session_state
.
nb_doc
))
if
st
.
session_state
.
nb_doc
>
limitItems
:
form2
.
write
(
st
.
session_state
.
general_text_dict
[
'perform1'
]
+
str
(
limitItems
)
+
st
.
session_state
.
general_text_dict
[
'perform2'
])
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
limitItems
)
else
:
st
.
session_state
.
nb_wanted
=
form2
.
slider
(
st
.
session_state
.
general_text_dict
[
'nb_taken'
],
1
,
int
(
st
.
session_state
.
nb_doc
))
form2
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
set_stage
,
args
=
(
2
,))
else
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'overload_api'
]
)
# Download
if
st
.
session_state
.
stage_isidore
>
1
:
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'createTSV'
]
):
if
st
.
session_state
.
output
==
''
:
print
(
st
.
session_state
.
nb_wanted
)
st
.
session_state
.
output
=
create_output
(
st
.
session_state
.
search
,
lang
[
st
.
session_state
.
language
],
st
.
session_state
.
nb_wanted
)
st
.
download_button
(
'Download TSV'
,
st
.
session_state
.
output
,
'output.csv'
)
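As context for the two loader helpers above, a minimal standalone sketch of the first request the page makes, which asks HAL how many documents match before showing the slider. It reuses the endpoint and field list from loadApiHALNbFile as written in the commit; the keyword 'climat' is only a hypothetical example.

import json
import requests as req

lang = 'fr'
search = 'climat'  # hypothetical search keyword
url = ('http://api.archives-ouvertes.fr/search/?q=' + search
       + '&rows=5&fl=title_s,' + lang + '_title_s,source_s,publicationDate_s,authFullName_s,'
       + lang + '_abstract_s,abstract_s&fq=language_s:' + lang)
resp = req.get(url)
docs = json.loads(resp.content)
print(docs['response']['numFound'])  # number of matching documents reported by HAL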
Streamlit/pages/Isidore_To_GarganText.py
...
@@ -3,20 +3,25 @@ Streamlit Application
Loïc Chapron
"""

#streamlit run Isidore_To_GarganText.py

import streamlit as st
import requests as req
import json
import pandas as pd
import time

keys = ['search', 'language', 'stage_isidore', 'output', 'nb_doc', 'nb_wanted', 'nb_bad_file']
for key in st.session_state.keys():
    if key not in keys and 'general_' not in key:
        del st.session_state[key]

st.image('img/gargantool_banner.jpg')

st.markdown("""
    <style>
        .block-container {
            padding-top: 2rem;
            padding-bottom: 0rem;
            padding-left: 1rem;
            padding-right: 1rem;
        }
    </style>
""", unsafe_allow_html=True)

def load_bundle(lang):
    df = pd.read_csv("lang/text_IsidoreToGarganText.csv")
...
@@ -38,6 +43,9 @@ if 'general_session_page' not in st.session_state.keys():
elif st.session_state.general_session_page != 'IsidoreToGarganText':
    st.session_state.general_text_dict = load_bundle(st.session_state.general_lang_dict[st.session_state.general_language])
    st.session_state.general_session_page = 'IsidoreToGarganText'
    for key in st.session_state.keys():
        if 'general_' not in key:
            del st.session_state[key]

st.selectbox('Langue', list(st.session_state.general_lang_dict.keys()), list(st.session_state.general_lang_dict.keys()).index(st.session_state.general_language), key='general_language', on_change=update_lang)
...
@@ -84,7 +92,9 @@ def create_output(search, language, nb_doc):
            break
        time.sleep(retryTime)
        print('Retry')
        output += createFile(txt, numberReplies, language)
        tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language)
        output += tmp
        nb += nb_tmp
    if nb_doc % numberReplies != 0:
        txt = loadApiIsidorePage(search, language, nb_doc // numberReplies + 1)
        tmp, nb_tmp = createFile(txt, nb_doc % numberReplies, language)
...
Streamlit/pages/Istex_To_GarganText.py
0 → 100644
"""
Streamlit Application
Loïc Chapron
"""
import
json
import
pandas
as
pd
import
datetime
import
zipfile
import
streamlit
as
st
st
.
image
(
'img/gargantool_banner.jpg'
)
st
.
markdown
(
"""
<style>
.block-container {
padding-top: 2rem;
padding-bottom: 0rem;
padding-left: 1rem;
padding-right: 1rem;
}
</style>
"""
,
unsafe_allow_html
=
True
)
def
load_bundle
(
lang
):
df
=
pd
.
read_csv
(
"lang/text_IstexToGarganText.csv"
)
df
=
df
.
query
(
f
"locale == '{lang}'"
)
tmp
=
{}
for
i
in
range
(
len
(
df
)):
tmp
[
df
.
key
.
to_list
()[
i
]]
=
df
.
value
.
to_list
()[
i
]
return
tmp
def
update_lang
():
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
if
'general_session_page'
not
in
st
.
session_state
.
keys
():
st
.
session_state
.
general_lang_dict
=
{
'Français'
:
'fr'
,
'English'
:
'en'
}
st
.
session_state
.
general_text_dict
=
load_bundle
(
'fr'
)
st
.
session_state
.
general_language
=
'Français'
st
.
session_state
.
general_session_page
=
'IstexToGarganText'
elif
st
.
session_state
.
general_session_page
!=
'IstexToGarganText'
:
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
st
.
session_state
.
general_session_page
=
'IstexToGarganText'
for
key
in
st
.
session_state
.
keys
():
if
'general_'
not
in
key
:
del
st
.
session_state
[
key
]
st
.
selectbox
(
'Langue'
,
list
(
st
.
session_state
.
general_lang_dict
.
keys
()),
list
(
st
.
session_state
.
general_lang_dict
.
keys
())
.
index
(
st
.
session_state
.
general_language
),
key
=
'general_language'
,
on_change
=
update_lang
)
def
read_zip
(
zip_file
):
output
=
[]
dup
=
0
with
zipfile
.
ZipFile
(
zip_file
,
'r'
)
as
zip_ref
:
for
file
in
zip_ref
.
namelist
():
if
file
.
split
(
'.'
)[
1
]
!=
'json'
or
file
.
split
(
'.'
)[
0
]
==
'manifest'
:
continue
try
:
with
zip_ref
.
open
(
file
)
as
f
:
data
=
json
.
load
(
f
)
article
=
pd
.
json_normalize
(
data
)
f
.
close
()
temp
=
{}
temp
[
"title"
]
=
article
.
get
(
"title"
,
''
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
try
:
temp
[
"abstract"
]
=
article
.
get
(
"abstract"
,
""
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
except
Exception
as
e
:
temp
[
"abstract"
]
=
''
try
:
authors
=
""
for
author
in
article
[
"author"
][
0
]:
authors
+=
author
[
"name"
]
+
"; "
authors
=
authors
[:
-
2
]
except
:
author
=
''
temp
[
"code"
]
=
article
.
get
(
"_id"
)[
0
]
temp
[
"authors"
]
=
authors
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
try
:
temp
[
"source"
]
=
article
.
get
(
'host.title'
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
except
:
temp
[
"source"
]
=
''
try
:
temp
[
"publication_year"
]
=
article
[
"publicationDate"
][
0
]
except
:
temp
[
"publication_year"
]
=
datetime
.
date
.
today
()
.
year
temp
[
"publication_year"
]
=
article
.
get
(
"publicationDate"
,
datetime
.
date
.
today
()
.
year
)[
0
]
temp
[
"publication_month"
]
=
1
temp
[
"publication_day"
]
=
1
output
.
append
(
temp
)
except
Exception
as
e
:
dup
+=
1
zip_ref
.
close
()
output
=
pd
.
DataFrame
(
output
)
duplicated
=
output
[
'title'
]
.
str
.
lower
()
.
replace
(
","
,
""
,
regex
=
True
)
.
duplicated
()
if
(
duplicated
.
any
()):
dup
+=
duplicated
.
sum
()
output
.
drop
([
'code'
],
axis
=
1
)
output
=
output
[
~
duplicated
]
df
=
pd
.
DataFrame
(
output
)
return
df
.
to_csv
(
index
=
False
,
sep
=
'
\t
'
),
dup
st
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
])
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"zip"
],
key
=
'file'
)
if
file
:
try
:
name
=
file
.
name
.
split
(
'.'
)[
0
]
+
'.csv'
res
,
nb_dup
=
read_zip
(
file
)
if
nb_dup
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'dup1'
]
+
str
(
nb_dup
)
+
st
.
session_state
.
general_text_dict
[
'dup2'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
name
,
res
,
name
)
except
Exception
as
e
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'error'
])
print
(
e
)
file
.
close
()
\ No newline at end of file
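A small usage sketch for the read_zip helper above, run outside Streamlit: it returns the tab-separated text plus the number of entries dropped (wrong format or duplicated title). The archive name is hypothetical.

tsv_text, nb_dropped = read_zip("istex_export.zip")  # hypothetical ISTEX zip export
print(nb_dropped, "documents removed (bad format or duplicated title)")
with open("istex_export.csv", "w", encoding="utf-8") as out:
    out.write(tsv_text)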
Streamlit/pages/Merge_Term_GarganText.py
import json
"""
Streamlit Application
Loïc Chapron
"""
import streamlit as st
from datetime import datetime
import pandas as pd

for key in st.session_state.keys():
    if 'general_' not in key:
        del st.session_state[key]

st.image('img/gargantool_banner.jpg')

st.markdown("""
    <style>
        .block-container {
            padding-top: 2rem;
            padding-bottom: 0rem;
            padding-left: 1rem;
            padding-right: 1rem;
        }
    </style>
""", unsafe_allow_html=True)

def load_bundle(lang):
    df = pd.read_csv("lang/text_MergeTermGarganText.csv")
...
@@ -28,6 +41,9 @@ if 'general_session_page' not in st.session_state.keys():
elif st.session_state.general_session_page != 'MergeTermGarganText':
    st.session_state.general_text_dict = load_bundle(st.session_state.general_lang_dict[st.session_state.general_language])
    st.session_state.general_session_page = 'MergeTermGarganText'
    for key in st.session_state.keys():
        if 'general_' not in key:
            del st.session_state[key]

st.selectbox('Langue', list(st.session_state.general_lang_dict.keys()), list(st.session_state.general_lang_dict.keys()).index(st.session_state.general_language), key='general_language', on_change=update_lang)
...
@@ -36,11 +52,8 @@ st.selectbox('Langue', list(st.session_state.general_lang_dict.keys()), list(st.
def tmp(file1, file2):
    listJson1 = json.load(file1)
    listJson2 = json.load(file2)
    ngrams1 = listJson1['NgramsTerms']['data']
    ngrams2 = listJson2['NgramsTerms']['data']
    ngrams1 = file1['NgramsTerms']['data']
    ngrams2 = file2['NgramsTerms']['data']
    merged = {}
    roots = []
...
@@ -85,22 +98,36 @@ def tmp(file1, file2):
            merged[root] = ngrams1[root]
    listJson1['NgramsTerms']['data'] = merged
    listJson1['Authors']['data'] = {}
    listJson1['Institutes']['data'] = {}
    listJson1['Sources']['data'] = {}
    return json.dumps(listJson1, sort_keys=False, indent=4)
    file1['NgramsTerms']['data'] = merged
    file1['Authors']['data'] = {}
    file1['Institutes']['data'] = {}
    file1['Sources']['data'] = {}
    tmp = file1.to_json(orient='columns', indent=4)
    return tmp

st.subheader(st.session_state.general_text_dict['title'])

col1, col2 = st.columns(2)
with col1:
    file1 = st.file_uploader(st.session_state.general_text_dict['file'], 'json', key='file1')
    file1 = st.file_uploader(st.session_state.general_text_dict['file'], type=["json"], key='file1')
with col2:
    file2 = st.file_uploader(st.session_state.general_text_dict['file'], 'json', key='file2')
    file2 = st.file_uploader(st.session_state.general_text_dict['file'], type=["json"], key='file2')

if (file1 and file2):
    st.write(st.session_state.general_text_dict['new_file'])
    time = datetime.strftime(datetime.now(), "%d-%m-%Y/%H:%M:%S")
    name = 'output-' + time + '.json'
    st.download_button('Download File', tmp(file1, file2), name)
\ No newline at end of file
    try:
        df1 = pd.read_json(file1)
        df2 = pd.read_json(file2)
        st.write(st.session_state.general_text_dict['new_file'])
        time = datetime.strftime(datetime.now(), "%d-%m-%Y/%H:%M:%S")
        name = 'output-' + time + '.json'
        st.download_button('Download File', tmp(df1, df2), name)
    except Exception as e:
        st.write("Error : one of the file isn't valid")
    file1.close()
    file2.close()
\ No newline at end of file
Streamlit/pages/Zotero_To_GarganText.py
...
@@ -3,8 +3,6 @@ Streamlit Application
Loïc Chapron
"""

#streamlit run zotero2ggt.py

import streamlit as st
import requests as req
import json
...
@@ -16,13 +14,19 @@ limit = 50 # This value cannot exceed 100 !
st.set_page_config(page_title="Zotero to GarganText")

keys = ['id', 'page', 'docs', 'docsByKey', 'result', 'stage', 'collectionsKey', 'format', 'zotero_search', 'nbdoc', 'select']

st.image('img/gargantool_banner.jpg')

for key in st.session_state.keys():
    if key not in keys and 'general_' not in key:
        del st.session_state[key]

st.markdown("""
    <style>
        .block-container {
            padding-top: 2rem;
            padding-bottom: 0rem;
            padding-left: 1rem;
            padding-right: 1rem;
        }
    </style>
""", unsafe_allow_html=True)

@st.cache_data
def load_bundle(lang):
    df = pd.read_csv("lang/text_ZoteroToGarganText.csv")
    df = df.query(f"locale == '{lang}'")
...
@@ -43,6 +47,9 @@ if 'general_session_page' not in st.session_state.keys():
elif st.session_state.general_session_page != 'ZoteroToGarganText':
    st.session_state.general_text_dict = load_bundle(st.session_state.general_lang_dict[st.session_state.general_language])
    st.session_state.general_session_page = 'ZoteroToGarganText'
    for key in st.session_state.keys():
        if 'general_' not in key:
            del st.session_state[key]

st.selectbox('Langue', list(st.session_state.general_lang_dict.keys()), list(st.session_state.general_lang_dict.keys()).index(st.session_state.general_language), key='general_language', on_change=update_lang)
...
@@ -248,7 +255,7 @@ if st.session_state.stage == 0:
    form.write(st.session_state.general_text_dict['title'])
    lst = ['items', 'collections']
    st.session_state.id = form.text_input('ID', st.session_state.id, key='idForm', help="Trouvé votre ID d'utilisateur ici: https://www.zotero.org/settings/keys")
    st.session_state.id = form.text_input('ID', st.session_state.id, key='idForm', help=st.session_state.general_text_dict['help'])
    st.session_state.format = form.selectbox(st.session_state.general_text_dict['data'], lst, lst.index(st.session_state.format), key='formatForm')
    form.form_submit_button(st.session_state.general_text_dict['submit'], on_click=set_stage, args=(1,))
...