Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
GarganTexternal tools
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Anne-Laure Thomas Derepas
GarganTexternal tools
Commits
2f362ef1
Commit
2f362ef1
authored
Sep 11, 2023
by
Anne-Laure Thomas Derepas
Browse files
Options
Browse Files
Download
Plain Diff
Merge branch 'dev' into 'master'
Dev See merge request
!10
parents
f93762e6
92a7e0f5
Changes
20
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
319 additions
and
98 deletions
+319
-98
pages.toml
Streamlit/.streamlit/pages.toml
+2
-2
Homepage.py
Streamlit/Homepage.py
+3
-3
README.md
Streamlit/README.md
+1
-1
isc-pif_logo.png
Streamlit/img/isc-pif_logo.png
+0
-0
text_HALToGarganText.csv
Streamlit/lang/text_HALToGarganText.csv
+1
-1
text_Homepage.csv
Streamlit/lang/text_Homepage.csv
+0
-0
text_IsidoreToGarganText.csv
Streamlit/lang/text_IsidoreToGarganText.csv
+7
-7
text_PDFtoTSV.csv
Streamlit/lang/text_PDFtoTSV.csv
+2
-2
text_TXTtoTSV.csv
Streamlit/lang/text_TXTtoTSV.csv
+9
-6
CSV_Harzing_to_TSV.py
Streamlit/pages/CSV_Harzing_to_TSV.py
+1
-1
HAL_To_GarganText.py
Streamlit/pages/HAL_To_GarganText.py
+4
-2
Isidore_To_GarganText.py
Streamlit/pages/Isidore_To_GarganText.py
+17
-6
Istex_To_GarganText.py
Streamlit/pages/Istex_To_GarganText.py
+2
-2
PDF_to_TSV.py
Streamlit/pages/PDF_to_TSV.py
+14
-10
PDF_to_TXT.py
Streamlit/pages/PDF_to_TXT.py
+2
-2
TSV_Translator.py
Streamlit/pages/TSV_Translator.py
+7
-5
TXT_to_TSV.py
Streamlit/pages/TXT_to_TSV.py
+183
-16
YTB_to_TSV.py
Streamlit/pages/YTB_to_TSV.py
+50
-24
Zotero_To_GarganText.py
Streamlit/pages/Zotero_To_GarganText.py
+4
-2
basic.py
Streamlit/src/basic.py
+10
-6
No files found.
Streamlit/.streamlit/pages.toml
View file @
2f362ef1
[[pages]]
[[pages]]
path
=
"
Welcom
e.py"
path
=
"
Homepag
e.py"
name
=
"Home"
name
=
"Home
page
"
icon
=
":house:"
icon
=
":house:"
[[pages]]
[[pages]]
...
...
Streamlit/
Welcom
e.py
→
Streamlit/
Homepag
e.py
View file @
2f362ef1
...
@@ -4,13 +4,13 @@ Loïc Chapron
...
@@ -4,13 +4,13 @@ Loïc Chapron
"""
"""
import
streamlit
as
st
import
streamlit
as
st
import
src.basic
as
tmp
import
pandas
as
pd
import
src.basic
as
tmp
tmp
.
base
(
"
Welcom
e"
)
tmp
.
base
(
"
Homepag
e"
)
st
.
write
(
st
.
session_state
.
general_text_dict
[
'welcome'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'welcome'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'tools'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'tools'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'code'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'code'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'help'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'help'
])
Streamlit/README.md
View file @
2f362ef1
...
@@ -15,7 +15,7 @@ pip install youtube-transcript-api
...
@@ -15,7 +15,7 @@ pip install youtube-transcript-api
## Start Project
## Start Project
```
shell
```
shell
streamlit run
welcom
e.py
streamlit run
Homepag
e.py
```
```
## About YTB to TSV tool
## About YTB to TSV tool
...
...
Streamlit/img/isc-pif_logo.png
0 → 100644
View file @
2f362ef1
6.33 KB
Streamlit/lang/text_HALToGarganText.csv
View file @
2f362ef1
...
@@ -31,5 +31,5 @@ en,perform2," the maximum number of documents"
...
@@ -31,5 +31,5 @@ en,perform2," the maximum number of documents"
fr,nb_taken,"Nombres de documents à prendre"
fr,nb_taken,"Nombres de documents à prendre"
en,nb_taken,"Number of documents to take"
en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque minutes)"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque
s
minutes)"
en,createTSV,"Creation of the TSV file (It may take a while)"
en,createTSV,"Creation of the TSV file (It may take a while)"
Streamlit/lang/text_
Welcom
e.csv
→
Streamlit/lang/text_
Homepag
e.csv
View file @
2f362ef1
File moved
Streamlit/lang/text_IsidoreToGarganText.csv
View file @
2f362ef1
...
@@ -2,7 +2,7 @@ locale,key,value
...
@@ -2,7 +2,7 @@ locale,key,value
fr,title,"# Isidore vers GarganText"
fr,title,"# Isidore vers GarganText"
en,title,"# Isidore To GarganText"
en,title,"# Isidore To GarganText"
fr,text,"Effectue une recherche Isidore de documents scientifiques et les converti
r
en un fichier TSV."
fr,text,"Effectue une recherche Isidore de documents scientifiques et les converti
t
en un fichier TSV."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
en,text,"Do a Isidore scientific documents research and convert it into a TSV file."
fr,keyword,"Mots clés"
fr,keyword,"Mots clés"
...
@@ -17,21 +17,21 @@ en,submit,"Submit"
...
@@ -17,21 +17,21 @@ en,submit,"Submit"
fr,load_api,"Chargement de l'api..."
fr,load_api,"Chargement de l'api..."
en,load_api,"Loading API..."
en,load_api,"Loading API..."
fr,overload_api,"L'API est surchargé, relancer la requête dans quelques secondes"
fr,overload_api,"L'API est surchargé, relancer la requête dans quelques secondes
.
"
en,overload'api,"The API is overloaded, please retry the request in a few seconds"
en,overload'api,"The API is overloaded, please retry the request in a few seconds
.
"
fr,nb_doc,"Nombres de documents : "
fr,nb_doc,"Nombres de documents : "
en,nb_doc,"Numbers of documents : "
en,nb_doc,"Numbers of documents : "
fr,perform1,"Pour des raisons de performence, on limit à "
fr,perform1,"Pour des raisons de performence, on limit
e
à "
fr,perform2," le nombre
de document maximum
"
fr,perform2," le nombre
maximum de documents.
"
en,perform1,"For performance reasons, we limit to "
en,perform1,"For performance reasons, we limit to "
en,perform2,"
the maximum number of documents
"
en,perform2,"
,the maximum number of documents.
"
fr,nb_taken,"Nombres de documents à prendre"
fr,nb_taken,"Nombres de documents à prendre"
en,nb_taken,"Number of documents to take"
en,nb_taken,"Number of documents to take"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque minutes)"
fr,createTSV,"Création du fichier TSV (Cela peut prendre quelque
s
minutes)"
en,createTSV,"Creation of the TSV file (It may take a while)"
en,createTSV,"Creation of the TSV file (It may take a while)"
fr,doc_abstract1,"Il y a "
fr,doc_abstract1,"Il y a "
...
...
Streamlit/lang/text_PDFtoTSV.csv
View file @
2f362ef1
...
@@ -29,8 +29,8 @@ en,watermark,"Watermark : "
...
@@ -29,8 +29,8 @@ en,watermark,"Watermark : "
fr,submit," Soumettre "
fr,submit," Soumettre "
en,submit,"Submit "
en,submit,"Submit "
fr,loading," Conversion du
pdf
en cours "
fr,loading," Conversion du
PDF
en cours "
en,loading," Processing
pdf
conversion "
en,loading," Processing
PDF
conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : "
fr,warning2,"Les langues suivantes ont été détectées : "
...
...
Streamlit/lang/text_TXTtoTSV.csv
View file @
2f362ef1
...
@@ -5,11 +5,11 @@ en,title,"# TXT To TSV"
...
@@ -5,11 +5,11 @@ en,title,"# TXT To TSV"
fr,text,"Convertit un fichier TXT en un fichier TSV compatible avec Gargantext"
fr,text,"Convertit un fichier TXT en un fichier TSV compatible avec Gargantext"
en,text,"Convert a TXT file into a TSV file compatible with GarganText"
en,text,"Convert a TXT file into a TSV file compatible with GarganText"
fr,text2,"C
et outil détecte automatiquement les langues présentes au sein des PDF à l'aide de l'API Google Translate.
"
fr,text2,"C
onvertit un ZIP de fichiers TXT en fichiers TSV compatibles avec Gargantext
"
en,text2,"
This tool detect automatically the languages of the PDF with the Google Translate API.
"
en,text2,"
Convert a ZIP of TXT files into TSV files compatible with GarganText
"
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s)
et indiquer, s'il existe, le filigrane de ce PDF
."
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s)
de ce TXT
."
en,text3,"You can choose the title and the author(s)
and specify, if it does exist, the watermark for this PDF
."
en,text3,"You can choose the title and the author(s)
for this TXT
."
fr,file,"Choisir un fichier"
fr,file,"Choisir un fichier"
en,file,"Choose a file"
en,file,"Choose a file"
...
@@ -20,12 +20,15 @@ en,new_file,"Download your TSV file : "
...
@@ -20,12 +20,15 @@ en,new_file,"Download your TSV file : "
fr,author,"Auteur(s) : "
fr,author,"Auteur(s) : "
en,author,"Author(s) : "
en,author,"Author(s) : "
fr,title
PDF
,"Titre : "
fr,title
TXT
,"Titre : "
en,title
PDF
,"Title : "
en,title
TXT
,"Title : "
fr,submit," Soumettre "
fr,submit," Soumettre "
en,submit,"Submit "
en,submit,"Submit "
fr,loading," Conversion du TXT en cours "
en,loading," Processing TXT conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : "
fr,warning2,"Les langues suivantes ont été détectées : "
en,warning,"Warning ! Multiple languages have been detected at the source : "
en,warning,"Warning ! Multiple languages have been detected at the source : "
...
...
Streamlit/pages/CSV_Harzing_to_TSV.py
View file @
2f362ef1
...
@@ -141,4 +141,4 @@ if st.session_state.page == 1:
...
@@ -141,4 +141,4 @@ if st.session_state.page == 1:
tsv
=
HarzingToTsv
(
separator
)
tsv
=
HarzingToTsv
(
separator
)
name
=
st
.
session_state
.
file
.
name
.
split
(
'.'
)[
0
]
+
'.tsv'
name
=
st
.
session_state
.
file
.
name
.
split
(
'.'
)[
0
]
+
'.tsv'
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
name
,
tsv
,
name
,
on_click
=
resetPage
()
)
st
.
download_button
(
name
,
tsv
,
name
,
on_click
=
resetPage
)
Streamlit/pages/HAL_To_GarganText.py
View file @
2f362ef1
...
@@ -190,5 +190,7 @@ if st.session_state.stage_isidore > 1:
...
@@ -190,5 +190,7 @@ if st.session_state.stage_isidore > 1:
print
(
st
.
session_state
.
nb_wanted
)
print
(
st
.
session_state
.
nb_wanted
)
st
.
session_state
.
output
=
create_output
(
st
.
session_state
.
output
=
create_output
(
st
.
session_state
.
search
,
lang
[
st
.
session_state
.
language
],
st
.
session_state
.
nb_wanted
)
st
.
session_state
.
search
,
lang
[
st
.
session_state
.
language
],
st
.
session_state
.
nb_wanted
)
st
.
download_button
(
'Download TSV'
,
st
.
session_state
.
output
,
'output.csv'
)
fileName
=
"HALOutput_"
+
str
(
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d_
%
H:
%
M:
%
S"
))
+
'.csv'
st
.
download_button
(
'Download TSV'
,
st
.
session_state
.
output
,
fileName
)
Streamlit/pages/Isidore_To_GarganText.py
View file @
2f362ef1
...
@@ -7,6 +7,7 @@ import streamlit as st
...
@@ -7,6 +7,7 @@ import streamlit as st
import
requests
as
req
import
requests
as
req
import
json
import
json
import
time
import
time
from
datetime
import
datetime
from
json
import
JSONDecodeError
from
json
import
JSONDecodeError
import
src.basic
as
tmp
import
src.basic
as
tmp
...
@@ -64,11 +65,16 @@ def create_output(search, language, nb_doc):
...
@@ -64,11 +65,16 @@ def create_output(search, language, nb_doc):
break
break
time
.
sleep
(
retryTime
)
time
.
sleep
(
retryTime
)
print
(
'Retry'
)
print
(
'Retry'
)
tmp
,
nb_tmp
=
createFile
(
txt
,
n
b_doc
%
n
umberReplies
,
language
)
tmp
,
nb_tmp
=
createFile
(
txt
,
numberReplies
,
language
)
output
+=
tmp
output
+=
tmp
nb
+=
nb_tmp
nb
+=
nb_tmp
if
nb_doc
%
numberReplies
!=
0
:
if
nb_doc
%
numberReplies
!=
0
:
txt
=
loadApiIsidorePage
(
search
,
language
,
nb_doc
//
numberReplies
+
1
)
while
(
True
):
txt
=
loadApiIsidorePage
(
search
,
language
,
nb_doc
//
numberReplies
+
1
)
if
txt
!=
0
:
break
time
.
sleep
(
retryTime
)
print
(
'Retry'
)
tmp
,
nb_tmp
=
createFile
(
txt
,
nb_doc
%
numberReplies
,
language
)
tmp
,
nb_tmp
=
createFile
(
txt
,
nb_doc
%
numberReplies
,
language
)
output
+=
tmp
output
+=
tmp
nb
+=
nb_tmp
nb
+=
nb_tmp
...
@@ -139,12 +145,16 @@ def createFile(docs, limit, language):
...
@@ -139,12 +145,16 @@ def createFile(docs, limit, language):
else
:
else
:
abstract
=
tmp
abstract
=
tmp
else
:
else
:
abstract
=
abstract
[
'$'
]
if
'$'
in
abstract
.
keys
():
abstract
=
abstract
[
'$'
]
else
:
abstract
=
''
if
'types'
in
doc
[
'isidore'
]
.
keys
():
if
'types'
in
doc
[
'isidore'
]
.
keys
():
if
type
(
doc
[
'isidore'
][
'types'
][
'type'
]
==
str
)
and
doc
[
'isidore'
][
'types'
][
'type'
]
in
[
'Books'
,
'text'
]:
print
(
i
)
if
type
(
doc
[
'isidore'
][
'types'
][
'type'
])
==
str
and
doc
[
'isidore'
][
'types'
][
'type'
]
in
[
'Books'
,
'text'
]:
nb
+=
1
nb
+=
1
elif
type
(
doc
[
'isidore'
][
'types'
][
'type'
]
==
dict
)
and
doc
[
'isidore'
][
'types'
][
'type'
][
1
]
in
[
'Books'
,
'text'
]:
elif
type
(
doc
[
'isidore'
][
'types'
][
'type'
]
)
==
dict
and
doc
[
'isidore'
][
'types'
][
'type'
][
'$'
]
in
[
'Books'
,
'text'
]:
nb
+=
1
nb
+=
1
else
:
else
:
print
(
title
)
print
(
title
)
...
@@ -280,4 +290,5 @@ if st.session_state.stage_isidore > 1:
...
@@ -280,4 +290,5 @@ if st.session_state.stage_isidore > 1:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'doc_abstract1'
]
+
str
(
st
.
write
(
st
.
session_state
.
general_text_dict
[
'doc_abstract1'
]
+
str
(
st
.
session_state
.
nb_bad_file
)
+
st
.
session_state
.
general_text_dict
[
'doc_abstract2'
])
st
.
session_state
.
nb_bad_file
)
+
st
.
session_state
.
general_text_dict
[
'doc_abstract2'
])
st
.
download_button
(
'Download TSV'
,
st
.
session_state
.
output
,
'output.csv'
)
fileName
=
"isidoreOutput_"
+
str
(
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d_
%
H:
%
M:
%
S"
))
+
'.csv'
st
.
download_button
(
'Download TSV'
,
st
.
session_state
.
output
,
fileName
)
Streamlit/pages/Istex_To_GarganText.py
View file @
2f362ef1
...
@@ -91,13 +91,13 @@ file = st.file_uploader(
...
@@ -91,13 +91,13 @@ file = st.file_uploader(
if
file
:
if
file
:
try
:
try
:
name
=
file
.
name
.
split
(
'.'
)[
0
]
+
'.csv'
fileName
=
"istexOutput_"
+
str
(
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d_
%
H:
%
M:
%
S"
))
+
'.csv'
res
,
nb_dup
=
read_zip
(
file
)
res
,
nb_dup
=
read_zip
(
file
)
if
nb_dup
:
if
nb_dup
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'dup1'
]
+
str
(
st
.
write
(
st
.
session_state
.
general_text_dict
[
'dup1'
]
+
str
(
nb_dup
)
+
st
.
session_state
.
general_text_dict
[
'dup2'
])
nb_dup
)
+
st
.
session_state
.
general_text_dict
[
'dup2'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
name
,
res
,
n
ame
)
st
.
download_button
(
'Download TSV'
,
res
,
fileN
ame
)
except
Exception
as
e
:
except
Exception
as
e
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'error'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'error'
])
print
(
e
)
print
(
e
)
...
...
Streamlit/pages/PDF_to_TSV.py
View file @
2f362ef1
...
@@ -151,7 +151,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
...
@@ -151,7 +151,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
languages
=
detectLanguages
(
doc
,
languages
)
languages
=
detectLanguages
(
doc
,
languages
)
st
.
session_state
.
pdfLanguages
[
fileName
]
=
detectMultipleLanguages
(
st
.
session_state
.
pdfLanguages
[
fileName
]
=
detectMultipleLanguages
(
languages
,
fileName
)
languages
,
fileName
)
return
tsv
,
languages
return
tsv
def
correctedSequence
(
text
,
last
):
def
correctedSequence
(
text
,
last
):
...
@@ -173,21 +173,21 @@ def getInfo():
...
@@ -173,21 +173,21 @@ def getInfo():
return
st
.
session_state
.
author
,
title
,
st
.
session_state
.
watermark
return
st
.
session_state
.
author
,
title
,
st
.
session_state
.
watermark
def
txt
ToTSV
(
fileName
,
fileAddress
,
pdfDir
):
def
pdf
ToTSV
(
fileName
,
fileAddress
,
pdfDir
):
st
.
session_state
.
page
=
1
st
.
session_state
.
page
=
1
author
,
title
,
watermark
=
getInfo
()
author
,
title
,
watermark
=
getInfo
()
tsv
=
"authors
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
title
\t
abstract
\n
"
tsv
=
"authors
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
title
\t
abstract
\n
"
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'loading'
]):
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'loading'
]):
tsv
,
languages
=
segmentAbstract
(
fileName
,
fileAddress
,
tsv
,
author
,
title
,
tsv
=
segmentAbstract
(
fileName
,
fileAddress
,
tsv
,
author
,
title
,
str
(
date
.
today
()
.
year
),
"1"
,
"1"
,
watermark
)
str
(
date
.
today
()
.
year
),
"1"
,
"1"
,
watermark
)
if
'/'
in
fileName
:
if
'/'
in
fileName
:
fileName
=
fileName
.
split
(
'/'
)[
1
]
fileName
=
fileName
.
split
(
'/'
)[
1
]
with
open
(
pdfDir
+
"/"
+
fileName
.
replace
(
".pdf"
,
".tsv"
),
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
with
open
(
pdfDir
+
"/"
+
fileName
.
replace
(
".pdf"
,
"
(pdf)
.tsv"
),
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
file
.
write
(
tsv
)
file
.
write
(
tsv
)
tsv
=
"
\n
"
.
join
(
tsv
.
split
(
"
\n
"
)[
1
:])
tsv
=
"
\n
"
.
join
(
tsv
.
split
(
"
\n
"
)[
1
:])
return
tsv
,
languages
return
tsv
def
extractAllPDF
(
zipDir
,
zipFile
):
def
extractAllPDF
(
zipDir
,
zipFile
):
...
@@ -223,6 +223,10 @@ def setSubmit():
...
@@ -223,6 +223,10 @@ def setSubmit():
st
.
session_state
.
submit
=
True
st
.
session_state
.
submit
=
True
def
resetPage
():
st
.
session_state
.
page
=
0
def
upPage
():
def
upPage
():
st
.
session_state
.
page
=
2
st
.
session_state
.
page
=
2
...
@@ -234,7 +238,7 @@ def uploadZip():
...
@@ -234,7 +238,7 @@ def uploadZip():
st
.
file_uploader
(
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"zip"
],
key
=
'file'
)
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"zip"
],
key
=
'file'
)
st
.
form_submit_button
(
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
def
askPDF
(
fileName
):
def
askPDF
(
fileName
):
...
@@ -254,7 +258,7 @@ def askPDF(fileName):
...
@@ -254,7 +258,7 @@ def askPDF(fileName):
st
.
text_input
(
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'watermark'
],
key
=
'watermark'
)
st
.
session_state
.
general_text_dict
[
'watermark'
],
key
=
'watermark'
)
st
.
form_submit_button
(
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
upPage
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
upPage
)
# Page Code End
# Page Code End
...
@@ -279,7 +283,7 @@ if st.session_state.page == 0:
...
@@ -279,7 +283,7 @@ if st.session_state.page == 0:
if
st
.
session_state
.
page
==
2
:
if
st
.
session_state
.
page
==
2
:
fileName
=
st
.
session_state
.
fileName
fileName
=
st
.
session_state
.
fileName
tmp
,
languages
=
txt
ToTSV
(
tmp
=
pdf
ToTSV
(
fileName
,
st
.
session_state
.
zipDir
.
name
+
'/'
+
fileName
,
st
.
session_state
.
pdfDir
.
name
)
fileName
,
st
.
session_state
.
zipDir
.
name
+
'/'
+
fileName
,
st
.
session_state
.
pdfDir
.
name
)
st
.
session_state
.
tsv
+=
"
\n
"
+
tmp
st
.
session_state
.
tsv
+=
"
\n
"
+
tmp
if
st
.
session_state
.
nbDoc
==
st
.
session_state
.
len
-
1
:
if
st
.
session_state
.
nbDoc
==
st
.
session_state
.
len
-
1
:
...
@@ -306,4 +310,4 @@ if st.session_state.page == 3:
...
@@ -306,4 +310,4 @@ if st.session_state.page == 3:
detectMultiplePdfLanguages
()
detectMultiplePdfLanguages
()
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
"PDFCompilation.zip"
,
st
.
download_button
(
"PDFCompilation.zip"
,
zip
,
"PDFCompilation.zip"
)
zip
,
"PDFCompilation.zip"
,
on_click
=
resetPage
)
Streamlit/pages/PDF_to_TXT.py
View file @
2f362ef1
...
@@ -97,7 +97,7 @@ def askPDF():
...
@@ -97,7 +97,7 @@ def askPDF():
st
.
file_uploader
(
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"pdf"
],
key
=
'file'
)
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"pdf"
],
key
=
'file'
)
st
.
form_submit_button
(
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
# Page Code End
# Page Code End
...
@@ -124,4 +124,4 @@ if st.session_state.page == 1:
...
@@ -124,4 +124,4 @@ if st.session_state.page == 1:
name
=
st
.
session_state
.
file
.
name
.
split
(
'.'
)[
0
]
+
'.txt'
name
=
st
.
session_state
.
file
.
name
.
split
(
'.'
)[
0
]
+
'.txt'
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
session_state
.
submit
=
False
st
.
session_state
.
submit
=
False
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
setPage
()
)
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
setPage
)
Streamlit/pages/TSV_Translator.py
View file @
2f362ef1
...
@@ -43,12 +43,13 @@ def estimateLanguagesPercentage(languages):
...
@@ -43,12 +43,13 @@ def estimateLanguagesPercentage(languages):
for
l
in
languages
:
for
l
in
languages
:
total
+=
languages
[
l
]
total
+=
languages
[
l
]
for
l
in
languages
:
for
l
in
languages
:
tmp
=
(
languages
[
l
]
/
total
)
*
100
tmp
=
round
((
languages
[
l
]
/
total
)
*
100
,
1
)
if
tmp
>=
15
:
if
tmp
>=
15
:
res
[
l
]
=
tmp
res
[
l
]
=
tmp
if
st
.
session_state
.
detected
!=
""
:
if
st
.
session_state
.
detected
!=
""
:
st
.
session_state
.
detected
+=
"| "
st
.
session_state
.
detected
+=
"| "
st
.
session_state
.
detected
+=
l
+
" : "
+
str
(
tmp
)
+
"
%
"
st
.
session_state
.
detected
+=
l
+
" : "
+
str
(
tmp
)
+
"
%
"
print
(
res
)
return
res
return
res
...
@@ -182,10 +183,11 @@ def uploadTSV():
...
@@ -182,10 +183,11 @@ def uploadTSV():
with
st
.
form
(
"Detect"
):
with
st
.
form
(
"Detect"
):
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text2'
])
st
.
file_uploader
(
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"tsv"
,
"csv"
],
key
=
'file'
)
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"tsv"
,
"csv"
],
key
=
'file'
)
st
.
form_submit_button
(
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'detect'
],
on_click
=
setDetect
()
)
st
.
session_state
.
general_text_dict
[
'detect'
],
on_click
=
setDetect
)
def
askTranslateLanguages
(
file
):
def
askTranslateLanguages
(
file
):
...
@@ -202,7 +204,7 @@ def askTranslateLanguages(file):
...
@@ -202,7 +204,7 @@ def askTranslateLanguages(file):
st
.
selectbox
(
st
.
session_state
.
general_text_dict
[
'translate2'
],
st
.
session_state
.
languages
,
st
.
selectbox
(
st
.
session_state
.
general_text_dict
[
'translate2'
],
st
.
session_state
.
languages
,
key
=
'destLang'
)
key
=
'destLang'
)
st
.
form_submit_button
(
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
# Page Code End
# Page Code End
...
@@ -239,11 +241,11 @@ if st.session_state.page == 2:
...
@@ -239,11 +241,11 @@ if st.session_state.page == 2:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
name
=
st
.
session_state
.
tmpFile
.
name
name
=
st
.
session_state
.
tmpFile
.
name
st
.
download_button
(
name
,
st
.
download_button
(
name
,
tsv
,
name
,
on_click
=
resetPage
()
)
tsv
,
name
,
on_click
=
resetPage
)
if
st
.
session_state
.
page
==
3
:
if
st
.
session_state
.
page
==
3
:
st
.
write
(
st
.
write
(
st
.
session_state
.
general_text_dict
[
'sameLanguages'
]
+
list
(
st
.
session_state
.
languages
.
keys
())[
0
])
st
.
session_state
.
general_text_dict
[
'sameLanguages'
]
+
list
(
st
.
session_state
.
languages
.
keys
())[
0
])
st
.
session_state
.
languages
=
{}
st
.
session_state
.
languages
=
{}
st
.
button
(
st
.
button
(
st
.
session_state
.
general_text_dict
[
'anotherFile'
],
on_click
=
resetPage
()
)
st
.
session_state
.
general_text_dict
[
'anotherFile'
],
on_click
=
resetPage
)
Streamlit/pages/TXT_to_TSV.py
View file @
2f362ef1
...
@@ -4,13 +4,12 @@ Nicolas Atrax
...
@@ -4,13 +4,12 @@ Nicolas Atrax
"""
"""
import
streamlit
as
st
import
streamlit
as
st
import
pandas
as
pd
import
zipfile
import
chardet
import
tempfile
import
shutil
import
os
import
re
import
re
from
datetime
import
date
from
datetime
import
date
import
codecs
import
os
import
tempfile
from
lib.langdetect.langdetect
import
detect
from
lib.langdetect.langdetect
import
detect
from
lib.langdetect.langdetect.lang_detect_exception
import
LangDetectException
from
lib.langdetect.langdetect.lang_detect_exception
import
LangDetectException
import
src.basic
as
tmp
import
src.basic
as
tmp
...
@@ -70,6 +69,18 @@ def detectMultipleLanguages(languages, fileName):
...
@@ -70,6 +69,18 @@ def detectMultipleLanguages(languages, fileName):
return
principal
return
principal
def
detectMultipleTxtLanguages
():
languages
=
[]
for
l
in
st
.
session_state
.
txtLanguages
.
values
():
if
l
not
in
languages
and
len
(
languages
)
==
1
:
st
.
info
(
st
.
session_state
.
general_text_dict
[
'globalWarning'
])
st
.
info
(
str
(
st
.
session_state
.
txtLanguages
))
st
.
info
(
st
.
session_state
.
general_text_dict
[
'advice'
])
return
if
len
(
languages
)
==
0
:
languages
.
append
(
l
)
def
segmentAbstract
(
abstract
,
tsv
):
def
segmentAbstract
(
abstract
,
tsv
):
year
=
str
(
date
.
today
()
.
year
)
year
=
str
(
date
.
today
()
.
year
)
month
=
"1"
month
=
"1"
...
@@ -101,6 +112,35 @@ def segmentAbstract(abstract, tsv):
...
@@ -101,6 +112,35 @@ def segmentAbstract(abstract, tsv):
return
tsv
return
tsv
def
segmentAbstract2
(
abstract
,
tsv
,
author
,
title
):
year
=
str
(
date
.
today
()
.
year
)
month
=
"1"
day
=
"1"
source
=
title
nbLines
=
len
(
abstract
)
n
=
0
count
=
1
languages
=
{}
while
n
<
nbLines
-
2
:
doc
=
""
.
join
(
abstract
[
n
:
n
+
9
])
.
replace
(
"�"
,
""
)
title
=
source
+
" : Part "
+
str
(
count
)
tsv
+=
correctedSequence
(
author
,
False
)
+
"
\t
"
+
correctedSequence
(
source
,
False
)
+
"
\t
"
+
year
+
"
\t
"
+
month
+
"
\t
"
+
day
+
"
\t
"
tsv
+=
correctedSequence
(
title
,
False
)
+
"
\t
"
tsv
+=
correctedSequence
(
doc
,
True
)
if
tsv
[
-
1
]
!=
"
\n
"
:
tsv
+=
"
\n
"
n
+=
7
count
+=
1
if
n
>
nbLines
-
9
and
n
!=
nbLines
-
2
:
n
=
nbLines
-
9
languages
=
detectLanguages
(
doc
,
languages
)
st
.
session_state
.
txtLanguages
[
fileName
]
=
detectMultipleLanguages
(
languages
,
source
)
return
tsv
def
correctedSequence
(
text
,
last
):
def
correctedSequence
(
text
,
last
):
tmp
=
text
.
replace
(
"
\"
"
,
"
\"\"
"
)
tmp
=
text
.
replace
(
"
\"
"
,
"
\"\"
"
)
find
=
"
\t
"
in
text
or
"
\"
"
in
text
or
"
\n
"
in
text
find
=
"
\t
"
in
text
or
"
\"
"
in
text
or
"
\n
"
in
text
...
@@ -115,16 +155,31 @@ def correctedSequence(text, last):
...
@@ -115,16 +155,31 @@ def correctedSequence(text, last):
def
getTxt
():
def
getTxt
():
txt
=
[]
txt
=
[]
st
.
session_state
.
pdf
Dir
=
tempfile
.
TemporaryDirectory
()
st
.
session_state
.
tmp
Dir
=
tempfile
.
TemporaryDirectory
()
name
=
st
.
session_state
.
file
.
name
name
=
st
.
session_state
.
file
.
name
with
open
(
st
.
session_state
.
pdf
Dir
.
name
+
"/"
+
name
,
"wb"
)
as
file
:
with
open
(
st
.
session_state
.
tmp
Dir
.
name
+
"/"
+
name
,
"wb"
)
as
file
:
file
.
write
(
st
.
session_state
.
file
.
getvalue
())
file
.
write
(
st
.
session_state
.
file
.
getvalue
())
with
open
(
st
.
session_state
.
pdf
Dir
.
name
+
"/"
+
name
,
"r"
)
as
file
:
with
open
(
st
.
session_state
.
tmp
Dir
.
name
+
"/"
+
name
,
"r"
)
as
file
:
for
line
in
file
:
for
line
in
file
:
txt
.
append
(
line
)
txt
.
append
(
line
)
return
txt
return
txt
def
getTxt2
(
fileAddress
):
txt
=
[]
with
open
(
fileAddress
,
"r"
)
as
file
:
for
line
in
file
:
txt
.
append
(
line
)
return
txt
def
getInfo
():
title
=
st
.
session_state
.
title
if
title
==
""
:
title
=
st
.
session_state
.
fileName
.
replace
(
".txt"
,
""
)
return
st
.
session_state
.
author
,
title
def
txtToTSV
():
def
txtToTSV
():
fileName
=
st
.
session_state
.
file
.
name
fileName
=
st
.
session_state
.
file
.
name
...
@@ -135,6 +190,32 @@ def txtToTSV():
...
@@ -135,6 +190,32 @@ def txtToTSV():
return
tsv
return
tsv
def
txtToTSV2
(
fileName
,
fileAddress
,
txtDir
):
st
.
session_state
.
page
=
1
author
,
title
=
getInfo
()
abstract
=
getTxt2
(
fileAddress
)
tsv
=
"authors
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
title
\t
abstract
\n
"
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'loading'
]):
tsv
=
segmentAbstract2
(
abstract
,
tsv
,
author
,
title
)
if
'/'
in
fileName
:
fileName
=
fileName
.
split
(
'/'
)[
1
]
with
open
(
txtDir
+
"/"
+
fileName
.
replace
(
".txt"
,
"(txt).tsv"
),
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
file
.
write
(
tsv
)
tsv
=
"
\n
"
.
join
(
tsv
.
split
(
"
\n
"
)[
1
:])
return
tsv
def
extractAllTXT
(
zipDir
,
zipFile
):
with
zipfile
.
ZipFile
(
zipFile
)
as
zipRef
:
zipInfos
=
zipRef
.
infolist
()
for
info
in
zipInfos
:
while
'/'
in
info
.
filename
and
len
(
info
.
filename
.
split
(
'/'
))
>
1
:
info
.
filename
=
"/"
.
join
(
info
.
filename
.
split
(
'/'
)[
1
:])
if
".txt"
in
info
.
filename
:
zipRef
.
extract
(
info
,
zipDir
)
# Tool Code End
# Tool Code End
...
@@ -148,20 +229,45 @@ if 'page' not in st.session_state:
...
@@ -148,20 +229,45 @@ if 'page' not in st.session_state:
if
'warning'
not
in
st
.
session_state
:
if
'warning'
not
in
st
.
session_state
:
st
.
session_state
.
warning
=
""
st
.
session_state
.
warning
=
""
if
'submit'
not
in
st
.
session_state
:
st
.
session_state
.
submit
=
False
if
'zipSubmit'
not
in
st
.
session_state
:
st
.
session_state
.
zipSubmit
=
False
if
'txtLanguages'
not
in
st
.
session_state
:
st
.
session_state
.
txtLanguages
=
{}
def
setSubmit
():
def
setSubmit
():
st
.
session_state
.
submit
=
True
st
.
session_state
.
submit
=
True
def
setPage
():
def
setZIPSubmit
():
st
.
session_state
.
zipSubmit
=
True
def
resetPage
():
st
.
session_state
.
page
=
0
st
.
session_state
.
page
=
0
def
upPage
():
st
.
session_state
.
page
=
3
def
uploadZip
():
with
st
.
form
(
"Submit2"
):
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text2'
])
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"zip"
],
key
=
'zipFile'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setZIPSubmit
)
def
askTXT
():
def
askTXT
():
with
st
.
form
(
"Submit"
):
with
st
.
form
(
"Submit"
):
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text2'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text3'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text3'
])
col1
,
col2
=
st
.
columns
(
2
)
col1
,
col2
=
st
.
columns
(
2
)
st
.
session_state
.
author
=
""
st
.
session_state
.
author
=
""
...
@@ -170,28 +276,58 @@ def askTXT():
...
@@ -170,28 +276,58 @@ def askTXT():
st
.
session_state
.
general_text_dict
[
'author'
],
key
=
'author'
)
st
.
session_state
.
general_text_dict
[
'author'
],
key
=
'author'
)
with
col2
:
with
col2
:
st
.
text_input
(
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'title
PDF
'
],
key
=
'title'
)
st
.
session_state
.
general_text_dict
[
'title
TXT
'
],
key
=
'title'
)
st
.
file_uploader
(
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"txt"
],
key
=
'file'
)
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"txt"
],
key
=
'file'
)
st
.
form_submit_button
(
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
())
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
def
askTXT2
(
fileName
):
with
st
.
form
(
"Submit"
):
st
.
write
(
fileName
)
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text3'
])
col1
,
col2
=
st
.
columns
(
2
)
st
.
session_state
.
author
=
""
with
col1
:
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'author'
],
key
=
'author'
)
with
col2
:
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'titleTXT'
],
key
=
'title'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
upPage
)
# Page Code End
# Page Code End
if
'submit'
not
in
st
.
session_state
:
st
.
session_state
.
submit
=
False
if
st
.
session_state
.
page
==
0
:
if
st
.
session_state
.
page
==
0
:
if
st
.
session_state
.
submit
:
if
st
.
session_state
.
submit
:
st
.
session_state
.
submit
=
False
st
.
session_state
.
submit
=
False
if
st
.
session_state
.
file
!=
None
:
if
st
.
session_state
.
file
!=
None
:
print
(
st
.
session_state
.
file
)
st
.
session_state
.
page
=
1
st
.
session_state
.
page
=
1
else
:
else
:
askTXT
()
askTXT
()
uploadZip
()
elif
st
.
session_state
.
zipSubmit
:
st
.
session_state
.
zipSubmit
=
False
if
st
.
session_state
.
zipFile
!=
None
:
st
.
session_state
.
zipDir
=
tempfile
.
TemporaryDirectory
()
st
.
session_state
.
txtDir
=
tempfile
.
TemporaryDirectory
()
st
.
session_state
.
tsv
=
"authors
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
title
\t
abstract"
extractAllTXT
(
st
.
session_state
.
zipDir
.
name
,
st
.
session_state
.
zipFile
)
st
.
session_state
.
len
=
len
(
os
.
listdir
(
st
.
session_state
.
zipDir
.
name
))
st
.
session_state
.
nbDoc
=
0
st
.
session_state
.
page
=
2
else
:
askTXT
()
uploadZip
()
else
:
else
:
askTXT
()
askTXT
()
uploadZip
()
if
st
.
session_state
.
page
==
1
:
if
st
.
session_state
.
page
==
1
:
name
=
st
.
session_state
.
file
.
name
name
=
st
.
session_state
.
file
.
name
...
@@ -201,4 +337,35 @@ if st.session_state.page == 1:
...
@@ -201,4 +337,35 @@ if st.session_state.page == 1:
st
.
session_state
.
submit
=
False
st
.
session_state
.
submit
=
False
if
st
.
session_state
.
warning
!=
""
:
if
st
.
session_state
.
warning
!=
""
:
st
.
info
(
st
.
session_state
.
warning
)
st
.
info
(
st
.
session_state
.
warning
)
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
setPage
())
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
resetPage
)
if
st
.
session_state
.
page
==
3
:
fileName
=
st
.
session_state
.
fileName
tmp
=
txtToTSV2
(
fileName
,
st
.
session_state
.
zipDir
.
name
+
'/'
+
fileName
,
st
.
session_state
.
txtDir
.
name
)
st
.
session_state
.
tsv
+=
"
\n
"
+
tmp
if
st
.
session_state
.
nbDoc
==
st
.
session_state
.
len
-
1
:
st
.
session_state
.
page
=
4
else
:
st
.
session_state
.
nbDoc
+=
1
st
.
session_state
.
page
=
2
if
st
.
session_state
.
page
==
2
:
fileName
=
os
.
listdir
(
st
.
session_state
.
zipDir
.
name
)[
st
.
session_state
.
nbDoc
]
st
.
session_state
.
fileName
=
fileName
if
'/'
in
fileName
:
fileName
=
fileName
.
split
(
'/'
)[
1
]
askTXT2
(
fileName
)
if
st
.
session_state
.
page
==
4
:
with
open
(
st
.
session_state
.
txtDir
.
name
+
"/TXTCompilation.tsv"
,
"w"
,
encoding
=
'utf-8-sig'
)
as
file
:
file
.
write
(
st
.
session_state
.
tsv
)
shutil
.
make_archive
(
st
.
session_state
.
zipDir
.
name
+
"/TXTCompilation"
,
'zip'
,
st
.
session_state
.
txtDir
.
name
)
with
open
(
st
.
session_state
.
zipDir
.
name
+
"/TXTCompilation.zip"
,
'rb'
)
as
zip
:
if
st
.
session_state
.
warning
!=
""
:
st
.
info
(
st
.
session_state
.
warning
)
detectMultipleTxtLanguages
()
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
"TXTCompilation.zip"
,
zip
,
"TXTCompilation.zip"
,
on_click
=
resetPage
)
Streamlit/pages/YTB_to_TSV.py
View file @
2f362ef1
...
@@ -4,11 +4,6 @@ Nicolas Atrax
...
@@ -4,11 +4,6 @@ Nicolas Atrax
"""
"""
import
streamlit
as
st
import
streamlit
as
st
import
pandas
as
pd
import
chardet
import
re
import
codecs
import
os
import
tempfile
import
tempfile
import
shutil
import
shutil
from
datetime
import
date
from
datetime
import
date
...
@@ -17,6 +12,8 @@ from lib.youtubetranscript.youtube_transcript_api import YouTubeTranscriptApi
...
@@ -17,6 +12,8 @@ from lib.youtubetranscript.youtube_transcript_api import YouTubeTranscriptApi
from
lib.youtubetranscript.youtube_transcript_api._transcripts
import
NoTranscriptFound
from
lib.youtubetranscript.youtube_transcript_api._transcripts
import
NoTranscriptFound
from
lib.youtubetranscript.youtube_transcript_api._transcripts
import
TranscriptsDisabled
from
lib.youtubetranscript.youtube_transcript_api._transcripts
import
TranscriptsDisabled
import
src.basic
as
tmp
import
src.basic
as
tmp
import
time
import
random
tmp
.
base
(
"YTBtoTSV"
)
tmp
.
base
(
"YTBtoTSV"
)
...
@@ -24,7 +21,11 @@ tmp.base("YTBtoTSV")
...
@@ -24,7 +21,11 @@ tmp.base("YTBtoTSV")
def
ytbSearch
(
search
,
n
):
def
ytbSearch
(
search
,
n
):
videosSearch
=
VideosSearch
(
search
)
if
st
.
session_state
.
videoLang
==
'fr'
:
region
=
'FR'
else
:
region
=
'US'
videosSearch
=
VideosSearch
(
search
,
region
=
region
)
result
=
videosSearch
.
result
()[
"result"
]
result
=
videosSearch
.
result
()[
"result"
]
videos
=
[]
videos
=
[]
while
len
(
videos
)
<
n
:
while
len
(
videos
)
<
n
:
...
@@ -35,8 +36,11 @@ def ytbSearch(search, n):
...
@@ -35,8 +36,11 @@ def ytbSearch(search, n):
videos
.
append
([
id
,
author
,
title
])
videos
.
append
([
id
,
author
,
title
])
if
len
(
videos
)
==
n
:
if
len
(
videos
)
==
n
:
break
break
if
len
(
videos
)
==
n
:
break
tmpResult
=
result
tmpResult
=
result
videosSearch
.
next
()
videosSearch
.
next
()
time
.
sleep
(
1.0
)
result
=
videosSearch
.
result
()[
"result"
]
result
=
videosSearch
.
result
()[
"result"
]
if
result
==
tmpResult
:
if
result
==
tmpResult
:
break
break
...
@@ -44,27 +48,37 @@ def ytbSearch(search, n):
...
@@ -44,27 +48,37 @@ def ytbSearch(search, n):
def
getLang
(
list
):
def
getLang
(
list
):
tmp
=
""
for
lang
in
list
:
for
lang
in
list
:
return
str
(
lang
)
.
split
(
" "
)[
0
]
tmp
=
str
(
lang
)
.
split
(
" "
)[
0
]
if
tmp
==
st
.
session_state
.
videoLang
:
break
return
tmp
def
translatedTranscript
(
lang
,
lst
,
title
,
manual
):
def
translateTranscript
(
lst
,
lang
):
if
lang
!=
"en"
:
origin
=
lst
.
find_transcript
([
lang
])
res
=
lst
.
find_transcript
([
lang
])
manual
=
not
origin
.
is_generated
trans
=
res
.
translate
(
"en"
)
.
fetch
()
return
origin
.
translate
(
st
.
session_state
.
videoLang
)
.
fetch
(),
manual
return
trans
return
lst
.
find_transcript
([
lang
])
.
fetch
()
def
ytbTranscript
(
id
,
title
):
def
ytbTranscript
(
id
,
title
):
try
:
try
:
transcriptList
=
YouTubeTranscriptApi
.
list_transcripts
(
id
)
transcriptList
=
YouTubeTranscriptApi
.
list_transcripts
(
id
)
lang
=
getLang
(
transcriptList
)
lang
=
getLang
(
transcriptList
)
if
lang
!=
st
.
session_state
.
videoLang
:
return
translateTranscript
(
transcriptList
,
lang
)
try
:
try
:
transcriptList
.
find_manually_created_transcript
([
lang
])
transcript
=
transcriptList
.
find_manually_created_transcript
(
return
translatedTranscript
(
lang
,
transcriptList
,
title
,
True
),
True
[
st
.
session_state
.
videoLang
])
.
fetch
()
return
transcript
,
True
except
NoTranscriptFound
:
except
NoTranscriptFound
:
return
translatedTranscript
(
lang
,
transcriptList
,
title
,
False
),
False
try
:
transcript
=
transcriptList
.
find_generated_transcript
(
[
st
.
session_state
.
videoLang
])
.
fetch
()
return
transcript
,
False
except
NoTranscriptFound
:
return
None
,
False
except
TranscriptsDisabled
:
except
TranscriptsDisabled
:
return
None
,
False
return
None
,
False
...
@@ -133,6 +147,12 @@ def transcriptManualToDoc(transcript, author, title, date):
...
@@ -133,6 +147,12 @@ def transcriptManualToDoc(transcript, author, title, date):
else
:
else
:
tmp
+=
text
+
" "
tmp
+=
text
+
" "
time
+=
float
(
part
[
"duration"
])
time
+=
float
(
part
[
"duration"
])
if
time
>=
20
:
tsv
=
tsvAdd
(
tsv
,
tmp
,
author
,
title
,
date
,
count
)
tmp
=
""
time
=
0
count
+=
1
with
open
(
st
.
session_state
.
zipDir
.
name
+
"/"
+
title
+
".tsv"
,
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
with
open
(
st
.
session_state
.
zipDir
.
name
+
"/"
+
title
+
".tsv"
,
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
file
.
write
(
tsv
)
file
.
write
(
tsv
)
tsv
=
"
\n
"
.
join
(
tsv
.
split
(
"
\n
"
)[
1
:])
tsv
=
"
\n
"
.
join
(
tsv
.
split
(
"
\n
"
)[
1
:])
...
@@ -170,15 +190,19 @@ def transcriptToTsv(search, nbVideos):
...
@@ -170,15 +190,19 @@ def transcriptToTsv(search, nbVideos):
dict
=
st
.
session_state
.
general_text_dict
dict
=
st
.
session_state
.
general_text_dict
with
st
.
spinner
(
dict
[
'loadingID'
]):
with
st
.
spinner
(
dict
[
'loadingID'
]):
if
st
.
session_state
.
manualOnly
:
if
st
.
session_state
.
manualOnly
:
videos
=
ytbSearch
(
search
,
nbVideos
*
20
)
videos
=
ytbSearch
(
search
,
nbVideos
*
15
)
else
:
else
:
videos
=
ytbSearch
(
search
,
nbVideos
*
4
)
videos
=
ytbSearch
(
search
,
nbVideos
*
5
)
count
=
0
count
=
0
count
Manu
al
=
0
count
Tot
al
=
0
bar
=
st
.
progress
(
count
/
nbVideos
,
dict
[
'loading'
]
+
bar
=
st
.
progress
(
count
/
nbVideos
,
dict
[
'loading'
]
+
str
(
count
)
+
dict
[
'quantity'
]
+
str
(
nbVideos
))
str
(
count
)
+
dict
[
'quantity'
]
+
str
(
nbVideos
))
for
video
in
videos
:
for
video
in
videos
:
print
(
count
)
countTotal
+=
1
waitingTime
=
random
.
uniform
(
2.0
,
7.0
)
# print("Waiting time : " + str(waitingTime))
time
.
sleep
(
waitingTime
)
# print(countTotal)
bar
.
progress
(
count
/
nbVideos
,
dict
[
'loading'
]
+
bar
.
progress
(
count
/
nbVideos
,
dict
[
'loading'
]
+
str
(
count
)
+
dict
[
'quantity'
]
+
str
(
nbVideos
))
str
(
count
)
+
dict
[
'quantity'
]
+
str
(
nbVideos
))
if
count
==
nbVideos
:
if
count
==
nbVideos
:
...
@@ -191,7 +215,6 @@ def transcriptToTsv(search, nbVideos):
...
@@ -191,7 +215,6 @@ def transcriptToTsv(search, nbVideos):
continue
continue
transcript
=
correctTranscript
(
transcript
)
transcript
=
correctTranscript
(
transcript
)
if
manual
:
if
manual
:
countManual
+=
1
tsv
+=
transcriptManualToDoc
(
transcript
,
tsv
+=
transcriptManualToDoc
(
transcript
,
author
,
title
,
str
(
date
.
today
()
.
year
))
author
,
title
,
str
(
date
.
today
()
.
year
))
count
+=
1
count
+=
1
...
@@ -226,14 +249,17 @@ def resetPage():
...
@@ -226,14 +249,17 @@ def resetPage():
def
askVideos
():
def
askVideos
():
with
st
.
form
(
"Submit"
):
with
st
.
form
(
"Submit"
):
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text2'
])
st
.
selectbox
(
st
.
session_state
.
general_text_dict
[
'videoLang'
],
[
'fr'
,
'en'
],
key
=
'videoLang'
)
st
.
text_input
(
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'keywords'
],
key
=
'keywords'
)
st
.
session_state
.
general_text_dict
[
'keywords'
],
key
=
'keywords'
)
st
.
slider
(
st
.
slider
(
st
.
session_state
.
general_text_dict
[
'number'
],
1
,
3
0
,
key
=
'nb_taken'
)
st
.
session_state
.
general_text_dict
[
'number'
],
1
,
2
0
,
key
=
'nb_taken'
)
st
.
checkbox
(
st
.
checkbox
(
st
.
session_state
.
general_text_dict
[
'fill'
],
key
=
'manualOnly'
)
st
.
session_state
.
general_text_dict
[
'fill'
],
key
=
'manualOnly'
)
st
.
form_submit_button
(
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
# Page Code End
# Page Code End
...
@@ -258,4 +284,4 @@ if st.session_state.page == 1:
...
@@ -258,4 +284,4 @@ if st.session_state.page == 1:
with
open
(
compilName
+
".zip"
,
'rb'
)
as
zip
:
with
open
(
compilName
+
".zip"
,
'rb'
)
as
zip
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
st
.
session_state
.
keywords
+
".zip"
,
st
.
download_button
(
st
.
session_state
.
keywords
+
".zip"
,
zip
,
st
.
session_state
.
keywords
+
".zip"
,
on_click
=
resetPage
()
)
zip
,
st
.
session_state
.
keywords
+
".zip"
,
on_click
=
resetPage
)
Streamlit/pages/Zotero_To_GarganText.py
View file @
2f362ef1
...
@@ -6,7 +6,7 @@ Loïc Chapron
...
@@ -6,7 +6,7 @@ Loïc Chapron
import
streamlit
as
st
import
streamlit
as
st
import
requests
as
req
import
requests
as
req
import
json
import
json
from
datetime
import
date
from
datetime
import
date
,
datetime
import
src.basic
as
tmp
import
src.basic
as
tmp
...
@@ -226,6 +226,7 @@ if st.session_state.stage == 0:
...
@@ -226,6 +226,7 @@ if st.session_state.stage == 0:
# Form
# Form
form
=
st
.
form
(
'api'
)
form
=
st
.
form
(
'api'
)
lst
=
[
'items'
,
'collections'
]
lst
=
[
'items'
,
'collections'
]
st
.
session_state
.
id
=
form
.
text_input
(
st
.
session_state
.
id
=
form
.
text_input
(
'ID'
,
st
.
session_state
.
id
,
key
=
'idForm'
,
help
=
st
.
session_state
.
general_text_dict
[
'help'
])
'ID'
,
st
.
session_state
.
id
,
key
=
'idForm'
,
help
=
st
.
session_state
.
general_text_dict
[
'help'
])
...
@@ -307,7 +308,8 @@ if st.session_state.stage == 2 and st.session_state.format == 'collections':
...
@@ -307,7 +308,8 @@ if st.session_state.stage == 2 and st.session_state.format == 'collections':
output
=
createTSVfromCollections
()
output
=
createTSVfromCollections
()
st
.
write
(
st
.
session_state
.
general_text_dict
[
'fileTSV1'
]
+
str
(
st
.
write
(
st
.
session_state
.
general_text_dict
[
'fileTSV1'
]
+
str
(
len
(
output
.
split
(
'
\n
'
))
-
2
)
+
st
.
session_state
.
general_text_dict
[
'fileTSV2'
])
len
(
output
.
split
(
'
\n
'
))
-
2
)
+
st
.
session_state
.
general_text_dict
[
'fileTSV2'
])
st
.
download_button
(
'Download TSV'
,
output
,
'output.csv'
)
fileName
=
"zoteroOutput_"
+
str
(
datetime
.
now
()
.
strftime
(
"
%
Y-
%
m-
%
d_
%
H:
%
M:
%
S"
))
+
'.csv'
st
.
download_button
(
'Download TSV'
,
output
,
fileName
)
if
st
.
session_state
.
stage
>
0
:
if
st
.
session_state
.
stage
>
0
:
...
...
Streamlit/src/basic.py
View file @
2f362ef1
...
@@ -5,6 +5,11 @@ from st_pages import show_pages_from_config, add_indentation
...
@@ -5,6 +5,11 @@ from st_pages import show_pages_from_config, add_indentation
def
base
(
page
):
def
base
(
page
):
st
.
set_page_config
(
page_title
=
"GarganTools | "
+
page
,
page_icon
=
"img/isc-pif_logo.png"
,
)
st
.
markdown
(
st
.
markdown
(
f
'''
f
'''
<style>
<style>
...
@@ -56,10 +61,11 @@ def base(page):
...
@@ -56,10 +61,11 @@ def base(page):
show_pages_from_config
()
show_pages_from_config
()
elif
st
.
session_state
.
general_session_page
!=
page
:
elif
st
.
session_state
.
general_session_page
!=
page
:
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_language
)
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_language
)
st
.
session_state
.
general_session_page
=
page
st
.
session_state
.
general_session_page
=
page
show_pages_from_config
()
show_pages_from_config
()
# Delete every key who aren't fron this file
# Delete every key who aren't fron this file
for
key
in
st
.
session_state
.
keys
():
for
key
in
st
.
session_state
.
keys
():
if
'general_'
not
in
key
:
if
'general_'
not
in
key
:
...
@@ -67,14 +73,12 @@ def base(page):
...
@@ -67,14 +73,12 @@ def base(page):
add_indentation
()
add_indentation
()
# select the lang
# select the lang
coltitle
,
col
=
st
.
columns
([
4
,
1
])
coltitle
,
col
=
st
.
columns
([
4
,
1
])
with
coltitle
:
with
coltitle
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
])
with
col
:
with
col
:
_
,
col1
,
col2
=
st
.
columns
([
1
,
1
,
1
])
_
,
col1
,
col2
=
st
.
columns
([
1
,
1
,
1
])
with
col1
:
with
col1
:
st
.
button
(
':fr:'
,
on_click
=
update_lang
,
args
=
(
'fr'
,))
st
.
button
(
':fr:'
,
on_click
=
update_lang
,
args
=
(
'fr'
,))
with
col2
:
with
col2
:
st
.
button
(
':us:'
,
on_click
=
update_lang
,
args
=
(
'en'
,))
st
.
button
(
':us:'
,
on_click
=
update_lang
,
args
=
(
'en'
,))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment