Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
GarganTexternal tools
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Julien Moutinho
GarganTexternal tools
Commits
ff89fe15
Commit
ff89fe15
authored
Aug 31, 2023
by
Atrax Nicolas
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
Update pages
parent
1c63845c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
27 additions
and
38 deletions
+27
-38
text_PDFtoTSV.csv
Streamlit/lang/text_PDFtoTSV.csv
+3
-3
PDF_to_TSV.py
Streamlit/pages/PDF_to_TSV.py
+16
-32
PDF_to_TXT.py
Streamlit/pages/PDF_to_TXT.py
+2
-2
TXT_to_TSV.py
Streamlit/pages/TXT_to_TSV.py
+6
-1
No files found.
Streamlit/lang/text_PDFtoTSV.csv
View file @
ff89fe15
...
...
@@ -29,11 +29,11 @@ en,warning,"Warning ! Multiple languages have been detected at the source : "
en,warning2,"The following languages have been detected : "
fr,globalWarning, "Attention ! Plusieurs langues ont été détectées entre vos pdf !
Les langues suivantes ont été détectées : "
en,globalWarning,"Warning ! Multiple languages have been detected for your pdfs file !
The following languages have been detected : "
fr,globalWarning, "Attention ! Plusieurs langues ont été détectées entre vos pdf !
\n
Les langues suivantes ont été détectées : "
en,globalWarning,"Warning ! Multiple languages have been detected for your pdfs file !
\n
The following languages have been detected : "
fr,advice,"Cela pourrait affecter massivement l'analyse de GarganText.
Vous pouvez régler ça en traduisant avec l'outil TsvTranslator."
fr,advice,"Cela pourrait affecter massivement l'analyse de GarganText.
\n
Vous pouvez régler ça en traduisant avec l'outil TsvTranslator."
en,advice,"This could massively affect the analysis of Gargantext.\nYou can correct this by translation with the TsvTranslator tool."
Streamlit/pages/PDF_to_TSV.py
View file @
ff89fe15
...
...
@@ -3,22 +3,23 @@ Streamlit Application
Nicolas Atrax
"""
import
streamlit
as
st
import
pandas
as
pd
import
chardet
import
re
from
datetime
import
date
import
codecs
import
os
import
tempfile
import
zipfile
import
shutil
import
zipfile
import
tempfile
import
os
import
codecs
from
datetime
import
date
import
re
import
chardet
import
pandas
as
pd
import
streamlit
as
st
from
lib.tika.tika
import
parser
from
lib.langdetect.langdetect
import
detect
from
lib.langdetect.langdetect.lang_detect_exception
import
LangDetectException
import
src.basic
as
tmp
os
.
environ
[
'TIKA_SERVER_JAR'
]
=
'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'
tmp
.
base
(
"PDFtoTSV"
)
# Tool Code Start
...
...
@@ -83,7 +84,7 @@ def detectMultipleLanguages(languages, fileName):
detected
+=
l
+
" : "
+
str
(
languages
[
l
])
+
"
%
"
valuable
.
append
(
l
)
if
len
(
valuable
)
>
1
:
st
.
session_state
.
warning
+=
st
.
session_state
.
general_text_dict
[
'warning'
]
+
\
st
.
session_state
.
warning
+=
st
.
session_state
.
general_text_dict
[
'warning'
]
+
"
\"
"
+
\
fileName
+
"
\"
!
\n
"
st
.
session_state
.
warning
+=
st
.
session_state
.
general_text_dict
[
'warning2'
]
+
\
detected
+
"
\n
"
...
...
@@ -96,9 +97,9 @@ def detectMultiplePdfLanguages():
languages
=
[]
for
l
in
st
.
session_state
.
pdfLanguages
.
values
():
if
l
not
in
languages
and
len
(
languages
)
==
1
:
st
.
write
(
st
.
session_state
.
general_text_dict
[
'globalWarning'
])
st
.
write
(
str
(
st
.
session_state
.
pdfLanguages
))
st
.
write
(
st
.
session_state
.
general_text_dict
[
'advice'
])
st
.
error
(
st
.
session_state
.
general_text_dict
[
'globalWarning'
])
st
.
error
(
str
(
st
.
session_state
.
pdfLanguages
))
st
.
error
(
st
.
session_state
.
general_text_dict
[
'advice'
])
return
if
len
(
languages
)
==
0
:
languages
.
append
(
l
)
...
...
@@ -193,24 +194,6 @@ def extractAllPDF(zipDir, zipFile):
if
".pdf"
in
info
.
filename
:
zipRef
.
extract
(
info
,
zipDir
)
def
convertAllPDF
(
zipFile
,
zipDir
):
pdfLanguages
=
{}
tsv
=
"authors
\t
source
\t
publication_year
\t
publication_month
\t
publication_day
\t
title
\t
abstract
\n
"
with
tempfile
.
TemporaryDirectory
()
as
pdfDir
:
st
.
session_state
.
dir
=
pdfDir
extractAllPDF
(
zipDir
,
zipFile
)
for
file
in
os
.
listdir
(
zipDir
):
if
".pdf"
in
file
:
tmp
,
languages
=
txtToTSV
(
file
,
zipDir
+
'/'
+
file
,
pdfDir
)
tsv
+=
"
\n
"
+
tmp
with
open
(
pdfDir
+
"/final.tsv"
,
"w"
,
encoding
=
'utf-8-sig'
)
as
file
:
file
.
write
(
tsv
)
shutil
.
make_archive
(
zipDir
+
"/PDFCompilation"
,
'zip'
,
pdfDir
)
# Tool Code End
...
...
@@ -306,7 +289,8 @@ if st.session_state.page == 3:
shutil
.
make_archive
(
st
.
session_state
.
zipDir
.
name
+
"/PDFCompilation"
,
'zip'
,
st
.
session_state
.
pdfDir
.
name
)
with
open
(
st
.
session_state
.
zipDir
.
name
+
"/PDFCompilation.zip"
,
'rb'
)
as
zip
:
st
.
write
(
st
.
session_state
.
warning
)
if
st
.
session_state
.
warning
!=
""
:
st
.
error
(
st
.
session_state
.
warning
)
detectMultiplePdfLanguages
()
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
download_button
(
"PDFCompilation.zip"
,
...
...
Streamlit/pages/PDF_to_TXT.py
View file @
ff89fe15
...
...
@@ -3,6 +3,8 @@ Streamlit Application
Nicolas Atrax
"""
import
src.basic
as
tmp
from
lib.tika.tika
import
parser
import
streamlit
as
st
import
pandas
as
pd
import
chardet
...
...
@@ -11,8 +13,6 @@ from datetime import date
import
codecs
import
os
import
tempfile
from
lib.tika.tika
import
parser
import
src.basic
as
tmp
os
.
environ
[
'TIKA_SERVER_JAR'
]
=
'https://repo1.maven.org/maven2/org/apache/tika/tika-server/1.19/tika-server-1.19.jar'
tmp
.
base
(
"PDFtoTXT"
)
...
...
Streamlit/pages/TXT_to_TSV.py
View file @
ff89fe15
...
...
@@ -61,7 +61,7 @@ def detectMultipleLanguages(languages, fileName):
detected
+=
l
+
" : "
+
str
(
languages
[
l
])
+
"
%
"
valuable
.
append
(
l
)
if
len
(
valuable
)
>
1
:
st
.
session_state
.
warning
+=
st
.
session_state
.
general_text_dict
[
'warning'
]
+
\
st
.
session_state
.
warning
+=
st
.
session_state
.
general_text_dict
[
'warning'
]
+
"
\"
"
+
\
fileName
+
"
\"
!
\n
"
st
.
session_state
.
warning
+=
st
.
session_state
.
general_text_dict
[
'warning2'
]
+
\
detected
+
"
\n
"
...
...
@@ -145,6 +145,9 @@ form = st.form('api')
if
'page'
not
in
st
.
session_state
:
st
.
session_state
.
page
=
0
if
'warning'
not
in
st
.
session_state
:
st
.
session_state
.
warning
=
""
def
setSubmit
():
st
.
session_state
.
submit
=
True
...
...
@@ -193,4 +196,6 @@ if st.session_state.page == 1:
name
=
st
.
session_state
.
file
.
name
.
split
(
'.'
)[
0
]
+
'.tsv'
st
.
write
(
st
.
session_state
.
general_text_dict
[
'new_file'
])
st
.
session_state
.
submit
=
False
if
st
.
session_state
.
warning
!=
""
:
st
.
error
(
st
.
session_state
.
warning
)
st
.
download_button
(
name
,
txt
,
name
,
on_click
=
setPage
())
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment