Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
GarganTexternal tools
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
GarganTexternal tools
Commits
cf004a85
Commit
cf004a85
authored
Sep 06, 2023
by
Atrax Nicolas
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Update pages
parent
879a0266
Changes
7
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
93 additions
and
72 deletions
+93
-72
GarganText_Json_To_TSV.py
Streamlit/pages/GarganText_Json_To_TSV.py
+30
-31
Isidore_To_GarganText.py
Streamlit/pages/Isidore_To_GarganText.py
+0
-1
Istex_To_GarganText.py
Streamlit/pages/Istex_To_GarganText.py
+37
-30
Pubmed_To_GarganText.py
Streamlit/pages/Pubmed_To_GarganText.py
+1
-0
Ris_To_GarganText.py
Streamlit/pages/Ris_To_GarganText.py
+0
-2
YTB_to_TSV.py
Streamlit/pages/YTB_to_TSV.py
+1
-0
basic.py
Streamlit/src/basic.py
+24
-8
No files found.
Streamlit/pages/GarganText_Json_To_TSV.py
View file @
cf004a85
...
...
@@ -10,38 +10,39 @@ import src.basic as tmp
tmp
.
base
(
"GarganTextJsonToTSV"
)
def
getText
(
corpusJson
):
output
=
"title
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
abstract
\t
authors
\t
weight
\n
"
for
row
in
corpusJson
[
'corpus'
]
:
for
row
in
corpusJson
[
'corpus'
]:
doc
=
row
[
'document'
][
'hyperdata'
]
abstract
=
"empty"
authors
=
"empty"
title
=
"empty"
source
=
"empty"
if
'title'
in
doc
.
keys
()
:
title
=
doc
[
'title'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'source'
in
doc
.
keys
()
:
source
=
doc
[
'source'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'title'
in
doc
.
keys
():
title
=
doc
[
'title'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'abstract'
in
doc
.
keys
()
:
abstract
=
doc
[
'abstract'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'source'
in
doc
.
keys
()
:
source
=
doc
[
'source'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'abstract'
in
doc
.
keys
():
abstract
=
doc
[
'abstract'
]
.
replace
(
'"'
,
''
)
.
replace
(
'
\t
'
,
''
)
if
'authors'
in
doc
.
keys
()
:
if
'authors'
in
doc
.
keys
()
:
authors
=
doc
[
'authors'
]
output
+=
title
+
"
\t
"
+
source
+
"
\t
"
+
str
(
doc
[
'publication_year'
])
+
"
\t
"
+
str
(
doc
[
'publication_month'
])
+
"
\t
"
+
str
(
doc
[
'publication_day'
])
+
"
\t
"
+
abstract
+
"
\t
"
+
authors
+
"
\t
"
+
str
(
1
)
+
"
\n
"
output
+=
title
+
"
\t
"
+
source
+
"
\t
"
+
str
(
doc
[
'publication_year'
])
+
"
\t
"
+
str
(
doc
[
'publication_month'
])
+
"
\t
"
+
str
(
doc
[
'publication_day'
])
+
"
\t
"
+
abstract
+
"
\t
"
+
authors
+
"
\t
"
+
str
(
1
)
+
"
\n
"
return
output
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"json"
],
key
=
'file'
)
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"json"
],
key
=
'file'
)
if
file
:
try
:
...
...
@@ -52,6 +53,4 @@ if file:
st
.
download_button
(
name
,
getText
(
df
),
name
)
except
Exception
as
e
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'error'
])
print
(
e
)
file
.
close
()
Streamlit/pages/Isidore_To_GarganText.py
View file @
cf004a85
...
...
@@ -217,7 +217,6 @@ with col1:
with
col2
:
st
.
image
(
'img/gargantext_logo.jpg'
)
# Form
form
=
st
.
form
(
'api'
)
...
...
Streamlit/pages/Istex_To_GarganText.py
View file @
cf004a85
...
...
@@ -15,7 +15,7 @@ tmp.base("IstexToGarganText")
def
read_zip
(
zip_file
):
output
=
[]
output
=
[]
dup
=
0
with
zipfile
.
ZipFile
(
zip_file
,
'r'
)
as
zip_ref
:
for
file
in
zip_ref
.
namelist
():
...
...
@@ -26,42 +26,44 @@ def read_zip(zip_file):
with
zip_ref
.
open
(
file
)
as
f
:
data
=
json
.
load
(
f
)
article
=
pd
.
json_normalize
(
data
)
article
=
pd
.
json_normalize
(
data
)
f
.
close
()
temp
=
{}
temp
[
"title"
]
=
article
.
get
(
"title"
,
''
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
temp
=
{}
temp
[
"title"
]
=
article
.
get
(
"title"
,
''
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
try
:
temp
[
"abstract"
]
=
article
.
get
(
"abstract"
,
""
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
temp
[
"abstract"
]
=
article
.
get
(
"abstract"
,
""
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
except
Exception
as
e
:
temp
[
"abstract"
]
=
''
try
:
authors
=
""
authors
=
""
for
author
in
article
[
"author"
][
0
]:
authors
+=
author
[
"name"
]
+
"; "
authors
=
authors
[:
-
2
]
authors
+=
author
[
"name"
]
+
"; "
authors
=
authors
[:
-
2
]
except
:
author
=
''
temp
[
"code"
]
=
article
.
get
(
"_id"
)[
0
]
temp
[
"authors"
]
=
authors
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
temp
[
"authors"
]
=
authors
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
try
:
temp
[
"source"
]
=
article
.
get
(
'host.title'
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
temp
[
"source"
]
=
article
.
get
(
'host.title'
)[
0
]
.
encode
(
encoding
=
'UTF-8'
,
errors
=
'ignore'
)
.
decode
(
"utf-8"
)
.
replace
(
"
\t
"
,
" "
)
except
:
temp
[
"source"
]
=
''
try
:
temp
[
"publication_year"
]
=
article
[
"publicationDate"
][
0
]
temp
[
"publication_year"
]
=
article
[
"publicationDate"
][
0
]
except
:
temp
[
"publication_year"
]
=
datetime
.
date
.
today
()
.
year
temp
[
"publication_year"
]
=
article
.
get
(
"publicationDate"
,
datetime
.
date
.
today
()
.
year
)[
0
]
temp
[
"publication_month"
]
=
1
temp
[
"publication_
day"
]
=
1
temp
[
"publication_year"
]
=
datetime
.
date
.
today
()
.
year
temp
[
"publication_year"
]
=
article
.
get
(
"publicationDate"
,
datetime
.
date
.
today
()
.
year
)[
0
]
temp
[
"publication_
month"
]
=
1
temp
[
"publication_day"
]
=
1
output
.
append
(
temp
)
...
...
@@ -70,8 +72,9 @@ def read_zip(zip_file):
zip_ref
.
close
()
output
=
pd
.
DataFrame
(
output
)
duplicated
=
output
[
'title'
]
.
str
.
lower
()
.
replace
(
","
,
""
,
regex
=
True
)
.
duplicated
()
output
=
pd
.
DataFrame
(
output
)
duplicated
=
output
[
'title'
]
.
str
.
lower
()
.
replace
(
","
,
""
,
regex
=
True
)
.
duplicated
()
if
(
duplicated
.
any
()):
dup
+=
duplicated
.
sum
()
...
...
@@ -80,15 +83,19 @@ def read_zip(zip_file):
df
=
pd
.
DataFrame
(
output
)
return
df
.
to_csv
(
index
=
False
,
sep
=
'
\t
'
),
dup
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"zip"
],
key
=
'file'
)
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"zip"
],
key
=
'file'
)
if
file
:
try
:
name
=
file
.
name
.
split
(
'.'
)[
0
]
+
'.csv'
res
,
nb_dup
=
read_zip
(
file
)
if
nb_dup
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'dup1'
]
+
str
(
nb_dup
)
+
st
.
session_state
.
general_text_dict
[
'dup2'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'dup1'
]
+
str
(
nb_dup
)
+
st
.
session_state
.
general_text_dict
[
'dup2'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
name
,
res
,
name
)
except
Exception
as
e
:
...
...
Streamlit/pages/Pubmed_To_GarganText.py
View file @
cf004a85
...
...
@@ -68,6 +68,7 @@ def read_file(file):
output
+=
row
return
output
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"txt"
],
key
=
'file'
)
...
...
Streamlit/pages/Ris_To_GarganText.py
View file @
cf004a85
...
...
@@ -57,7 +57,6 @@ def read_file(file):
return
output
st
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
file
=
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"ris"
],
key
=
'file'
)
...
...
@@ -70,5 +69,4 @@ if file:
st
.
download_button
(
name
,
read_file
(
file
),
name
)
except
Exception
as
e
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'error'
])
print
(
e
)
file
.
close
()
Streamlit/pages/YTB_to_TSV.py
View file @
cf004a85
...
...
@@ -178,6 +178,7 @@ def transcriptToTsv(search, nbVideos):
bar
=
st
.
progress
(
count
/
nbVideos
,
dict
[
'loading'
]
+
str
(
count
)
+
dict
[
'quantity'
]
+
str
(
nbVideos
))
for
video
in
videos
:
print
(
count
)
bar
.
progress
(
count
/
nbVideos
,
dict
[
'loading'
]
+
str
(
count
)
+
dict
[
'quantity'
]
+
str
(
nbVideos
))
if
count
==
nbVideos
:
...
...
Streamlit/src/basic.py
View file @
cf004a85
...
...
@@ -5,6 +5,17 @@ from st_pages import show_pages_from_config, add_indentation
def
base
(
page
):
st
.
markdown
(
f
'''
<style>
.reportview-container .sidebar-content {{
padding-top: {1}rem;
}}
.reportview-container .main .block-container {{
padding-top: {1}rem;
}}
</style>
'''
,
unsafe_allow_html
=
True
)
show_pages_from_config
()
add_indentation
()
...
...
@@ -23,8 +34,8 @@ def base(page):
</style>
"""
,
unsafe_allow_html
=
True
)
# Load the language file
def
load_bundle
(
lang
):
df
=
pd
.
read_csv
(
"lang/text_"
+
page
+
".csv"
)
df
=
df
.
query
(
f
"locale == '{lang}'"
)
...
...
@@ -36,23 +47,28 @@ def base(page):
# Load the language file
def
update_lang
():
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
# Test if it's first connection on page or else if the last page was this one
if
'general_session_page'
not
in
st
.
session_state
.
keys
():
st
.
session_state
.
general_lang_dict
=
{
'Français'
:
'fr'
,
'English'
:
'en'
}
st
.
session_state
.
general_lang_dict
=
{
'Français'
:
'fr'
,
'English'
:
'en'
}
st
.
session_state
.
general_text_dict
=
load_bundle
(
'fr'
)
st
.
session_state
.
general_language
=
'Français'
st
.
session_state
.
general_session_page
=
page
elif
st
.
session_state
.
general_session_page
!=
page
:
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_lang_dict
[
st
.
session_state
.
general_language
])
st
.
session_state
.
general_session_page
=
page
# Delete every key who aren't fron this file
for
key
in
st
.
session_state
.
keys
():
if
'general_'
not
in
key
:
del
st
.
session_state
[
key
]
# select the lang
st
.
selectbox
(
'Langue'
,
list
(
st
.
session_state
.
general_lang_dict
.
keys
()),
list
(
st
.
session_state
.
general_lang_dict
.
keys
())
.
index
(
st
.
session_state
.
general_language
),
key
=
'general_language'
,
on_change
=
update_lang
)
st
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
])
# select the lang
st
.
selectbox
(
'Langue'
,
list
(
st
.
session_state
.
general_lang_dict
.
keys
()),
list
(
st
.
session_state
.
general_lang_dict
.
keys
(
))
.
index
(
st
.
session_state
.
general_language
),
key
=
'general_language'
,
on_change
=
update_lang
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment