Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
GarganTexternal tools
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
1
Merge Requests
1
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Anne-Laure Thomas Derepas
GarganTexternal tools
Commits
b6f15452
Commit
b6f15452
authored
Sep 11, 2023
by
Atrax Nicolas
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Update pages
parent
555164b4
Changes
13
Hide whitespace changes
Inline
Side-by-side
Showing
13 changed files
with
280 additions
and
78 deletions
+280
-78
pages.toml
Streamlit/.streamlit/pages.toml
+2
-2
Homepage.py
Streamlit/Homepage.py
+3
-3
text_Homepage.csv
Streamlit/lang/text_Homepage.csv
+0
-0
text_PDFtoTSV.csv
Streamlit/lang/text_PDFtoTSV.csv
+2
-2
text_TXTtoTSV.csv
Streamlit/lang/text_TXTtoTSV.csv
+9
-6
CSV_Harzing_to_TSV.py
Streamlit/pages/CSV_Harzing_to_TSV.py
+1
-1
PDF_to_TSV.py
Streamlit/pages/PDF_to_TSV.py
+14
-10
PDF_to_TXT.py
Streamlit/pages/PDF_to_TXT.py
+2
-2
TSV_Translator.py
Streamlit/pages/TSV_Translator.py
+7
-5
TXT_to_TSV.py
Streamlit/pages/TXT_to_TSV.py
+183
-16
YTB_to_TSV.py
Streamlit/pages/YTB_to_TSV.py
+50
-24
Zotero_To_GarganText.py
Streamlit/pages/Zotero_To_GarganText.py
+1
-0
basic.py
Streamlit/src/basic.py
+6
-7
No files found.
Streamlit/.streamlit/pages.toml
View file @
b6f15452
[[pages]]
path
=
"
Welcom
e.py"
name
=
"Home"
path
=
"
Homepag
e.py"
name
=
"Home
page
"
icon
=
":house:"
[[pages]]
...
...
Streamlit/
Welcom
e.py
→
Streamlit/
Homepag
e.py
View file @
b6f15452
...
...
@@ -4,13 +4,13 @@ Loïc Chapron
"""
import
streamlit
as
st
import
src.basic
as
tmp
import
pandas
as
pd
import
src.basic
as
tmp
tmp
.
base
(
"
Welcom
e"
)
tmp
.
base
(
"
Homepag
e"
)
st
.
write
(
st
.
session_state
.
general_text_dict
[
'welcome'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'tools'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'code'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'help'
])
Streamlit/lang/text_
Welcom
e.csv
→
Streamlit/lang/text_
Homepag
e.csv
View file @
b6f15452
File moved
Streamlit/lang/text_PDFtoTSV.csv
View file @
b6f15452
...
...
@@ -29,8 +29,8 @@ en,watermark,"Watermark : "
fr,submit," Soumettre "
en,submit,"Submit "
fr,loading," Conversion du
pdf
en cours "
en,loading," Processing
pdf
conversion "
fr,loading," Conversion du
PDF
en cours "
en,loading," Processing
PDF
conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : "
...
...
Streamlit/lang/text_TXTtoTSV.csv
View file @
b6f15452
...
...
@@ -5,11 +5,11 @@ en,title,"# TXT To TSV"
fr,text,"Convertit un fichier TXT en un fichier TSV compatible avec Gargantext"
en,text,"Convert a TXT file into a TSV file compatible with GarganText"
fr,text2,"C
et outil détecte automatiquement les langues présentes au sein des PDF à l'aide de l'API Google Translate.
"
en,text2,"
This tool detect automatically the languages of the PDF with the Google Translate API.
"
fr,text2,"C
onvertit un ZIP de fichiers TXT en fichiers TSV compatibles avec Gargantext
"
en,text2,"
Convert a ZIP of TXT files into TSV files compatible with GarganText
"
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s)
et indiquer, s'il existe, le filigrane de ce PDF
."
en,text3,"You can choose the title and the author(s)
and specify, if it does exist, the watermark for this PDF
."
fr,text3,"Vous pouvez choisir le titre et le(s) auteur(s)
de ce TXT
."
en,text3,"You can choose the title and the author(s)
for this TXT
."
fr,file,"Choisir un fichier"
en,file,"Choose a file"
...
...
@@ -20,12 +20,15 @@ en,new_file,"Download your TSV file : "
fr,author,"Auteur(s) : "
en,author,"Author(s) : "
fr,title
PDF
,"Titre : "
en,title
PDF
,"Title : "
fr,title
TXT
,"Titre : "
en,title
TXT
,"Title : "
fr,submit," Soumettre "
en,submit,"Submit "
fr,loading," Conversion du TXT en cours "
en,loading," Processing TXT conversion "
fr,warning,"Attention ! Plusieurs langues ont été détectées pour la source : "
fr,warning2,"Les langues suivantes ont été détectées : "
en,warning,"Warning ! Multiple languages have been detected at the source : "
...
...
Streamlit/pages/CSV_Harzing_to_TSV.py
View file @
b6f15452
...
...
@@ -141,4 +141,4 @@ if st.session_state.page == 1:
tsv
=
HarzingToTsv
(
separator
)
name
=
st
.
session_state
.
file
.
name
.
split
(
'.'
)[
0
]
+
'.tsv'
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
name
,
tsv
,
name
,
on_click
=
resetPage
()
)
st
.
download_button
(
name
,
tsv
,
name
,
on_click
=
resetPage
)
Streamlit/pages/PDF_to_TSV.py
View file @
b6f15452
...
...
@@ -151,7 +151,7 @@ def segmentAbstract(fileName, fileAddress, tsv, author, source, year, month, day
languages
=
detectLanguages
(
doc
,
languages
)
st
.
session_state
.
pdfLanguages
[
fileName
]
=
detectMultipleLanguages
(
languages
,
fileName
)
return
tsv
,
languages
return
tsv
def
correctedSequence
(
text
,
last
):
...
...
@@ -173,21 +173,21 @@ def getInfo():
return
st
.
session_state
.
author
,
title
,
st
.
session_state
.
watermark
def
txt
ToTSV
(
fileName
,
fileAddress
,
pdfDir
):
def
pdf
ToTSV
(
fileName
,
fileAddress
,
pdfDir
):
st
.
session_state
.
page
=
1
author
,
title
,
watermark
=
getInfo
()
tsv
=
"authors
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
title
\t
abstract
\n
"
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'loading'
]):
tsv
,
languages
=
segmentAbstract
(
fileName
,
fileAddress
,
tsv
,
author
,
title
,
str
(
date
.
today
()
.
year
),
"1"
,
"1"
,
watermark
)
tsv
=
segmentAbstract
(
fileName
,
fileAddress
,
tsv
,
author
,
title
,
str
(
date
.
today
()
.
year
),
"1"
,
"1"
,
watermark
)
if
'/'
in
fileName
:
fileName
=
fileName
.
split
(
'/'
)[
1
]
with
open
(
pdfDir
+
"/"
+
fileName
.
replace
(
".pdf"
,
".tsv"
),
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
with
open
(
pdfDir
+
"/"
+
fileName
.
replace
(
".pdf"
,
"
(pdf)
.tsv"
),
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
file
.
write
(
tsv
)
tsv
=
"
\n
"
.
join
(
tsv
.
split
(
"
\n
"
)[
1
:])
return
tsv
,
languages
return
tsv
def
extractAllPDF
(
zipDir
,
zipFile
):
...
...
@@ -223,6 +223,10 @@ def setSubmit():
st
.
session_state
.
submit
=
True
def
resetPage
():
st
.
session_state
.
page
=
0
def
upPage
():
st
.
session_state
.
page
=
2
...
...
@@ -234,7 +238,7 @@ def uploadZip():
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"zip"
],
key
=
'file'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
def
askPDF
(
fileName
):
...
...
@@ -254,7 +258,7 @@ def askPDF(fileName):
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'watermark'
],
key
=
'watermark'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
upPage
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
upPage
)
# Page Code End
...
...
@@ -279,7 +283,7 @@ if st.session_state.page == 0:
if
st
.
session_state
.
page
==
2
:
fileName
=
st
.
session_state
.
fileName
tmp
,
languages
=
txt
ToTSV
(
tmp
=
pdf
ToTSV
(
fileName
,
st
.
session_state
.
zipDir
.
name
+
'/'
+
fileName
,
st
.
session_state
.
pdfDir
.
name
)
st
.
session_state
.
tsv
+=
"
\n
"
+
tmp
if
st
.
session_state
.
nbDoc
==
st
.
session_state
.
len
-
1
:
...
...
@@ -306,4 +310,4 @@ if st.session_state.page == 3:
detectMultiplePdfLanguages
()
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
"PDFCompilation.zip"
,
zip
,
"PDFCompilation.zip"
)
zip
,
"PDFCompilation.zip"
,
on_click
=
resetPage
)
Streamlit/pages/PDF_to_TXT.py
View file @
b6f15452
...
...
@@ -97,7 +97,7 @@ def askPDF():
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"pdf"
],
key
=
'file'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
# Page Code End
...
...
@@ -124,4 +124,4 @@ if st.session_state.page == 1:
name
=
st
.
session_state
.
file
.
name
.
split
(
'.'
)[
0
]
+
'.txt'
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
session_state
.
submit
=
False
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
setPage
()
)
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
setPage
)
Streamlit/pages/TSV_Translator.py
View file @
b6f15452
...
...
@@ -43,12 +43,13 @@ def estimateLanguagesPercentage(languages):
for
l
in
languages
:
total
+=
languages
[
l
]
for
l
in
languages
:
tmp
=
(
languages
[
l
]
/
total
)
*
100
tmp
=
round
((
languages
[
l
]
/
total
)
*
100
,
1
)
if
tmp
>=
15
:
res
[
l
]
=
tmp
if
st
.
session_state
.
detected
!=
""
:
st
.
session_state
.
detected
+=
"| "
st
.
session_state
.
detected
+=
l
+
" : "
+
str
(
tmp
)
+
"
%
"
print
(
res
)
return
res
...
...
@@ -182,10 +183,11 @@ def uploadTSV():
with
st
.
form
(
"Detect"
):
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text2'
])
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"tsv"
,
"csv"
],
key
=
'file'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'detect'
],
on_click
=
setDetect
()
)
st
.
session_state
.
general_text_dict
[
'detect'
],
on_click
=
setDetect
)
def
askTranslateLanguages
(
file
):
...
...
@@ -202,7 +204,7 @@ def askTranslateLanguages(file):
st
.
selectbox
(
st
.
session_state
.
general_text_dict
[
'translate2'
],
st
.
session_state
.
languages
,
key
=
'destLang'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
# Page Code End
...
...
@@ -239,11 +241,11 @@ if st.session_state.page == 2:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
name
=
st
.
session_state
.
tmpFile
.
name
st
.
download_button
(
name
,
tsv
,
name
,
on_click
=
resetPage
()
)
tsv
,
name
,
on_click
=
resetPage
)
if
st
.
session_state
.
page
==
3
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'sameLanguages'
]
+
list
(
st
.
session_state
.
languages
.
keys
())[
0
])
st
.
session_state
.
languages
=
{}
st
.
button
(
st
.
session_state
.
general_text_dict
[
'anotherFile'
],
on_click
=
resetPage
()
)
st
.
session_state
.
general_text_dict
[
'anotherFile'
],
on_click
=
resetPage
)
Streamlit/pages/TXT_to_TSV.py
View file @
b6f15452
...
...
@@ -4,13 +4,12 @@ Nicolas Atrax
"""
import
streamlit
as
st
import
pandas
as
pd
import
chardet
import
zipfile
import
tempfile
import
shutil
import
os
import
re
from
datetime
import
date
import
codecs
import
os
import
tempfile
from
lib.langdetect.langdetect
import
detect
from
lib.langdetect.langdetect.lang_detect_exception
import
LangDetectException
import
src.basic
as
tmp
...
...
@@ -70,6 +69,18 @@ def detectMultipleLanguages(languages, fileName):
return
principal
def
detectMultipleTxtLanguages
():
languages
=
[]
for
l
in
st
.
session_state
.
txtLanguages
.
values
():
if
l
not
in
languages
and
len
(
languages
)
==
1
:
st
.
info
(
st
.
session_state
.
general_text_dict
[
'globalWarning'
])
st
.
info
(
str
(
st
.
session_state
.
txtLanguages
))
st
.
info
(
st
.
session_state
.
general_text_dict
[
'advice'
])
return
if
len
(
languages
)
==
0
:
languages
.
append
(
l
)
def
segmentAbstract
(
abstract
,
tsv
):
year
=
str
(
date
.
today
()
.
year
)
month
=
"1"
...
...
@@ -101,6 +112,35 @@ def segmentAbstract(abstract, tsv):
return
tsv
def
segmentAbstract2
(
abstract
,
tsv
,
author
,
title
):
year
=
str
(
date
.
today
()
.
year
)
month
=
"1"
day
=
"1"
source
=
title
nbLines
=
len
(
abstract
)
n
=
0
count
=
1
languages
=
{}
while
n
<
nbLines
-
2
:
doc
=
""
.
join
(
abstract
[
n
:
n
+
9
])
.
replace
(
"�"
,
""
)
title
=
source
+
" : Part "
+
str
(
count
)
tsv
+=
correctedSequence
(
author
,
False
)
+
"
\t
"
+
correctedSequence
(
source
,
False
)
+
"
\t
"
+
year
+
"
\t
"
+
month
+
"
\t
"
+
day
+
"
\t
"
tsv
+=
correctedSequence
(
title
,
False
)
+
"
\t
"
tsv
+=
correctedSequence
(
doc
,
True
)
if
tsv
[
-
1
]
!=
"
\n
"
:
tsv
+=
"
\n
"
n
+=
7
count
+=
1
if
n
>
nbLines
-
9
and
n
!=
nbLines
-
2
:
n
=
nbLines
-
9
languages
=
detectLanguages
(
doc
,
languages
)
st
.
session_state
.
txtLanguages
[
fileName
]
=
detectMultipleLanguages
(
languages
,
source
)
return
tsv
def
correctedSequence
(
text
,
last
):
tmp
=
text
.
replace
(
"
\"
"
,
"
\"\"
"
)
find
=
"
\t
"
in
text
or
"
\"
"
in
text
or
"
\n
"
in
text
...
...
@@ -115,16 +155,31 @@ def correctedSequence(text, last):
def
getTxt
():
txt
=
[]
st
.
session_state
.
pdf
Dir
=
tempfile
.
TemporaryDirectory
()
st
.
session_state
.
tmp
Dir
=
tempfile
.
TemporaryDirectory
()
name
=
st
.
session_state
.
file
.
name
with
open
(
st
.
session_state
.
pdf
Dir
.
name
+
"/"
+
name
,
"wb"
)
as
file
:
with
open
(
st
.
session_state
.
tmp
Dir
.
name
+
"/"
+
name
,
"wb"
)
as
file
:
file
.
write
(
st
.
session_state
.
file
.
getvalue
())
with
open
(
st
.
session_state
.
pdf
Dir
.
name
+
"/"
+
name
,
"r"
)
as
file
:
with
open
(
st
.
session_state
.
tmp
Dir
.
name
+
"/"
+
name
,
"r"
)
as
file
:
for
line
in
file
:
txt
.
append
(
line
)
return
txt
def
getTxt2
(
fileAddress
):
txt
=
[]
with
open
(
fileAddress
,
"r"
)
as
file
:
for
line
in
file
:
txt
.
append
(
line
)
return
txt
def
getInfo
():
title
=
st
.
session_state
.
title
if
title
==
""
:
title
=
st
.
session_state
.
fileName
.
replace
(
".txt"
,
""
)
return
st
.
session_state
.
author
,
title
def
txtToTSV
():
fileName
=
st
.
session_state
.
file
.
name
...
...
@@ -135,6 +190,32 @@ def txtToTSV():
return
tsv
def
txtToTSV2
(
fileName
,
fileAddress
,
txtDir
):
st
.
session_state
.
page
=
1
author
,
title
=
getInfo
()
abstract
=
getTxt2
(
fileAddress
)
tsv
=
"authors
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
title
\t
abstract
\n
"
with
st
.
spinner
(
st
.
session_state
.
general_text_dict
[
'loading'
]):
tsv
=
segmentAbstract2
(
abstract
,
tsv
,
author
,
title
)
if
'/'
in
fileName
:
fileName
=
fileName
.
split
(
'/'
)[
1
]
with
open
(
txtDir
+
"/"
+
fileName
.
replace
(
".txt"
,
"(txt).tsv"
),
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
file
.
write
(
tsv
)
tsv
=
"
\n
"
.
join
(
tsv
.
split
(
"
\n
"
)[
1
:])
return
tsv
def
extractAllTXT
(
zipDir
,
zipFile
):
with
zipfile
.
ZipFile
(
zipFile
)
as
zipRef
:
zipInfos
=
zipRef
.
infolist
()
for
info
in
zipInfos
:
while
'/'
in
info
.
filename
and
len
(
info
.
filename
.
split
(
'/'
))
>
1
:
info
.
filename
=
"/"
.
join
(
info
.
filename
.
split
(
'/'
)[
1
:])
if
".txt"
in
info
.
filename
:
zipRef
.
extract
(
info
,
zipDir
)
# Tool Code End
...
...
@@ -148,20 +229,45 @@ if 'page' not in st.session_state:
if
'warning'
not
in
st
.
session_state
:
st
.
session_state
.
warning
=
""
if
'submit'
not
in
st
.
session_state
:
st
.
session_state
.
submit
=
False
if
'zipSubmit'
not
in
st
.
session_state
:
st
.
session_state
.
zipSubmit
=
False
if
'txtLanguages'
not
in
st
.
session_state
:
st
.
session_state
.
txtLanguages
=
{}
def
setSubmit
():
st
.
session_state
.
submit
=
True
def
setPage
():
def
setZIPSubmit
():
st
.
session_state
.
zipSubmit
=
True
def
resetPage
():
st
.
session_state
.
page
=
0
def
upPage
():
st
.
session_state
.
page
=
3
def
uploadZip
():
with
st
.
form
(
"Submit2"
):
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text2'
])
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"zip"
],
key
=
'zipFile'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setZIPSubmit
)
def
askTXT
():
with
st
.
form
(
"Submit"
):
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text2'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text3'
])
col1
,
col2
=
st
.
columns
(
2
)
st
.
session_state
.
author
=
""
...
...
@@ -170,28 +276,58 @@ def askTXT():
st
.
session_state
.
general_text_dict
[
'author'
],
key
=
'author'
)
with
col2
:
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'title
PDF
'
],
key
=
'title'
)
st
.
session_state
.
general_text_dict
[
'title
TXT
'
],
key
=
'title'
)
st
.
file_uploader
(
st
.
session_state
.
general_text_dict
[
'file'
],
type
=
[
"txt"
],
key
=
'file'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
())
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
def
askTXT2
(
fileName
):
with
st
.
form
(
"Submit"
):
st
.
write
(
fileName
)
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text3'
])
col1
,
col2
=
st
.
columns
(
2
)
st
.
session_state
.
author
=
""
with
col1
:
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'author'
],
key
=
'author'
)
with
col2
:
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'titleTXT'
],
key
=
'title'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
upPage
)
# Page Code End
if
'submit'
not
in
st
.
session_state
:
st
.
session_state
.
submit
=
False
if
st
.
session_state
.
page
==
0
:
if
st
.
session_state
.
submit
:
st
.
session_state
.
submit
=
False
if
st
.
session_state
.
file
!=
None
:
print
(
st
.
session_state
.
file
)
st
.
session_state
.
page
=
1
else
:
askTXT
()
uploadZip
()
elif
st
.
session_state
.
zipSubmit
:
st
.
session_state
.
zipSubmit
=
False
if
st
.
session_state
.
zipFile
!=
None
:
st
.
session_state
.
zipDir
=
tempfile
.
TemporaryDirectory
()
st
.
session_state
.
txtDir
=
tempfile
.
TemporaryDirectory
()
st
.
session_state
.
tsv
=
"authors
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
title
\t
abstract"
extractAllTXT
(
st
.
session_state
.
zipDir
.
name
,
st
.
session_state
.
zipFile
)
st
.
session_state
.
len
=
len
(
os
.
listdir
(
st
.
session_state
.
zipDir
.
name
))
st
.
session_state
.
nbDoc
=
0
st
.
session_state
.
page
=
2
else
:
askTXT
()
uploadZip
()
else
:
askTXT
()
uploadZip
()
if
st
.
session_state
.
page
==
1
:
name
=
st
.
session_state
.
file
.
name
...
...
@@ -201,4 +337,35 @@ if st.session_state.page == 1:
st
.
session_state
.
submit
=
False
if
st
.
session_state
.
warning
!=
""
:
st
.
info
(
st
.
session_state
.
warning
)
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
setPage
())
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
resetPage
)
if
st
.
session_state
.
page
==
3
:
fileName
=
st
.
session_state
.
fileName
tmp
=
txtToTSV2
(
fileName
,
st
.
session_state
.
zipDir
.
name
+
'/'
+
fileName
,
st
.
session_state
.
txtDir
.
name
)
st
.
session_state
.
tsv
+=
"
\n
"
+
tmp
if
st
.
session_state
.
nbDoc
==
st
.
session_state
.
len
-
1
:
st
.
session_state
.
page
=
4
else
:
st
.
session_state
.
nbDoc
+=
1
st
.
session_state
.
page
=
2
if
st
.
session_state
.
page
==
2
:
fileName
=
os
.
listdir
(
st
.
session_state
.
zipDir
.
name
)[
st
.
session_state
.
nbDoc
]
st
.
session_state
.
fileName
=
fileName
if
'/'
in
fileName
:
fileName
=
fileName
.
split
(
'/'
)[
1
]
askTXT2
(
fileName
)
if
st
.
session_state
.
page
==
4
:
with
open
(
st
.
session_state
.
txtDir
.
name
+
"/TXTCompilation.tsv"
,
"w"
,
encoding
=
'utf-8-sig'
)
as
file
:
file
.
write
(
st
.
session_state
.
tsv
)
shutil
.
make_archive
(
st
.
session_state
.
zipDir
.
name
+
"/TXTCompilation"
,
'zip'
,
st
.
session_state
.
txtDir
.
name
)
with
open
(
st
.
session_state
.
zipDir
.
name
+
"/TXTCompilation.zip"
,
'rb'
)
as
zip
:
if
st
.
session_state
.
warning
!=
""
:
st
.
info
(
st
.
session_state
.
warning
)
detectMultipleTxtLanguages
()
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
"TXTCompilation.zip"
,
zip
,
"TXTCompilation.zip"
,
on_click
=
resetPage
)
Streamlit/pages/YTB_to_TSV.py
View file @
b6f15452
...
...
@@ -4,11 +4,6 @@ Nicolas Atrax
"""
import
streamlit
as
st
import
pandas
as
pd
import
chardet
import
re
import
codecs
import
os
import
tempfile
import
shutil
from
datetime
import
date
...
...
@@ -17,6 +12,8 @@ from lib.youtubetranscript.youtube_transcript_api import YouTubeTranscriptApi
from
lib.youtubetranscript.youtube_transcript_api._transcripts
import
NoTranscriptFound
from
lib.youtubetranscript.youtube_transcript_api._transcripts
import
TranscriptsDisabled
import
src.basic
as
tmp
import
time
import
random
tmp
.
base
(
"YTBtoTSV"
)
...
...
@@ -24,7 +21,11 @@ tmp.base("YTBtoTSV")
def
ytbSearch
(
search
,
n
):
videosSearch
=
VideosSearch
(
search
)
if
st
.
session_state
.
videoLang
==
'fr'
:
region
=
'FR'
else
:
region
=
'US'
videosSearch
=
VideosSearch
(
search
,
region
=
region
)
result
=
videosSearch
.
result
()[
"result"
]
videos
=
[]
while
len
(
videos
)
<
n
:
...
...
@@ -35,8 +36,11 @@ def ytbSearch(search, n):
videos
.
append
([
id
,
author
,
title
])
if
len
(
videos
)
==
n
:
break
if
len
(
videos
)
==
n
:
break
tmpResult
=
result
videosSearch
.
next
()
time
.
sleep
(
1.0
)
result
=
videosSearch
.
result
()[
"result"
]
if
result
==
tmpResult
:
break
...
...
@@ -44,27 +48,37 @@ def ytbSearch(search, n):
def
getLang
(
list
):
tmp
=
""
for
lang
in
list
:
return
str
(
lang
)
.
split
(
" "
)[
0
]
tmp
=
str
(
lang
)
.
split
(
" "
)[
0
]
if
tmp
==
st
.
session_state
.
videoLang
:
break
return
tmp
def
translatedTranscript
(
lang
,
lst
,
title
,
manual
):
if
lang
!=
"en"
:
res
=
lst
.
find_transcript
([
lang
])
trans
=
res
.
translate
(
"en"
)
.
fetch
()
return
trans
return
lst
.
find_transcript
([
lang
])
.
fetch
()
def
translateTranscript
(
lst
,
lang
):
origin
=
lst
.
find_transcript
([
lang
])
manual
=
not
origin
.
is_generated
return
origin
.
translate
(
st
.
session_state
.
videoLang
)
.
fetch
(),
manual
def
ytbTranscript
(
id
,
title
):
try
:
transcriptList
=
YouTubeTranscriptApi
.
list_transcripts
(
id
)
lang
=
getLang
(
transcriptList
)
if
lang
!=
st
.
session_state
.
videoLang
:
return
translateTranscript
(
transcriptList
,
lang
)
try
:
transcriptList
.
find_manually_created_transcript
([
lang
])
return
translatedTranscript
(
lang
,
transcriptList
,
title
,
True
),
True
transcript
=
transcriptList
.
find_manually_created_transcript
(
[
st
.
session_state
.
videoLang
])
.
fetch
()
return
transcript
,
True
except
NoTranscriptFound
:
return
translatedTranscript
(
lang
,
transcriptList
,
title
,
False
),
False
try
:
transcript
=
transcriptList
.
find_generated_transcript
(
[
st
.
session_state
.
videoLang
])
.
fetch
()
return
transcript
,
False
except
NoTranscriptFound
:
return
None
,
False
except
TranscriptsDisabled
:
return
None
,
False
...
...
@@ -133,6 +147,12 @@ def transcriptManualToDoc(transcript, author, title, date):
else
:
tmp
+=
text
+
" "
time
+=
float
(
part
[
"duration"
])
if
time
>=
20
:
tsv
=
tsvAdd
(
tsv
,
tmp
,
author
,
title
,
date
,
count
)
tmp
=
""
time
=
0
count
+=
1
with
open
(
st
.
session_state
.
zipDir
.
name
+
"/"
+
title
+
".tsv"
,
"w"
,
encoding
=
"utf-8-sig"
)
as
file
:
file
.
write
(
tsv
)
tsv
=
"
\n
"
.
join
(
tsv
.
split
(
"
\n
"
)[
1
:])
...
...
@@ -170,15 +190,19 @@ def transcriptToTsv(search, nbVideos):
dict
=
st
.
session_state
.
general_text_dict
with
st
.
spinner
(
dict
[
'loadingID'
]):
if
st
.
session_state
.
manualOnly
:
videos
=
ytbSearch
(
search
,
nbVideos
*
20
)
videos
=
ytbSearch
(
search
,
nbVideos
*
15
)
else
:
videos
=
ytbSearch
(
search
,
nbVideos
*
4
)
videos
=
ytbSearch
(
search
,
nbVideos
*
5
)
count
=
0
count
Manu
al
=
0
count
Tot
al
=
0
bar
=
st
.
progress
(
count
/
nbVideos
,
dict
[
'loading'
]
+
str
(
count
)
+
dict
[
'quantity'
]
+
str
(
nbVideos
))
for
video
in
videos
:
print
(
count
)
countTotal
+=
1
waitingTime
=
random
.
uniform
(
2.0
,
7.0
)
# print("Waiting time : " + str(waitingTime))
time
.
sleep
(
waitingTime
)
# print(countTotal)
bar
.
progress
(
count
/
nbVideos
,
dict
[
'loading'
]
+
str
(
count
)
+
dict
[
'quantity'
]
+
str
(
nbVideos
))
if
count
==
nbVideos
:
...
...
@@ -191,7 +215,6 @@ def transcriptToTsv(search, nbVideos):
continue
transcript
=
correctTranscript
(
transcript
)
if
manual
:
countManual
+=
1
tsv
+=
transcriptManualToDoc
(
transcript
,
author
,
title
,
str
(
date
.
today
()
.
year
))
count
+=
1
...
...
@@ -226,14 +249,17 @@ def resetPage():
def
askVideos
():
with
st
.
form
(
"Submit"
):
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text'
])
st
.
write
(
st
.
session_state
.
general_text_dict
[
'text2'
])
st
.
selectbox
(
st
.
session_state
.
general_text_dict
[
'videoLang'
],
[
'fr'
,
'en'
],
key
=
'videoLang'
)
st
.
text_input
(
st
.
session_state
.
general_text_dict
[
'keywords'
],
key
=
'keywords'
)
st
.
slider
(
st
.
session_state
.
general_text_dict
[
'number'
],
1
,
3
0
,
key
=
'nb_taken'
)
st
.
session_state
.
general_text_dict
[
'number'
],
1
,
2
0
,
key
=
'nb_taken'
)
st
.
checkbox
(
st
.
session_state
.
general_text_dict
[
'fill'
],
key
=
'manualOnly'
)
st
.
form_submit_button
(
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
()
)
st
.
session_state
.
general_text_dict
[
'submit'
],
on_click
=
setSubmit
)
# Page Code End
...
...
@@ -258,4 +284,4 @@ if st.session_state.page == 1:
with
open
(
compilName
+
".zip"
,
'rb'
)
as
zip
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
st
.
session_state
.
keywords
+
".zip"
,
zip
,
st
.
session_state
.
keywords
+
".zip"
,
on_click
=
resetPage
()
)
zip
,
st
.
session_state
.
keywords
+
".zip"
,
on_click
=
resetPage
)
Streamlit/pages/Zotero_To_GarganText.py
View file @
b6f15452
...
...
@@ -226,6 +226,7 @@ if st.session_state.stage == 0:
# Form
form
=
st
.
form
(
'api'
)
lst
=
[
'items'
,
'collections'
]
st
.
session_state
.
id
=
form
.
text_input
(
'ID'
,
st
.
session_state
.
id
,
key
=
'idForm'
,
help
=
st
.
session_state
.
general_text_dict
[
'help'
])
...
...
Streamlit/src/basic.py
View file @
b6f15452
...
...
@@ -6,7 +6,7 @@ from st_pages import show_pages_from_config, add_indentation
def
base
(
page
):
st
.
set_page_config
(
page_title
=
"GarganTools "
+
page
,
page_title
=
"GarganTools
|
"
+
page
,
page_icon
=
"img/isc-pif_logo.png"
,
)
...
...
@@ -61,10 +61,11 @@ def base(page):
show_pages_from_config
()
elif
st
.
session_state
.
general_session_page
!=
page
:
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_language
)
st
.
session_state
.
general_text_dict
=
load_bundle
(
st
.
session_state
.
general_language
)
st
.
session_state
.
general_session_page
=
page
show_pages_from_config
()
# Delete every key who aren't fron this file
for
key
in
st
.
session_state
.
keys
():
if
'general_'
not
in
key
:
...
...
@@ -72,14 +73,12 @@ def base(page):
add_indentation
()
# select the lang
coltitle
,
col
=
st
.
columns
([
4
,
1
])
coltitle
,
col
=
st
.
columns
([
4
,
1
])
with
coltitle
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'title'
])
with
col
:
_
,
col1
,
col2
=
st
.
columns
([
1
,
1
,
1
])
_
,
col1
,
col2
=
st
.
columns
([
1
,
1
,
1
])
with
col1
:
st
.
button
(
':fr:'
,
on_click
=
update_lang
,
args
=
(
'fr'
,))
with
col2
:
st
.
button
(
':us:'
,
on_click
=
update_lang
,
args
=
(
'en'
,))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment