GarganTexternal tools / Commits

Commit b5fe01d8, authored Jun 04, 2024 by Marie FU

enable user to convert all files from a directory for API and minor correction for Streamlit

parent 369bdd93

Showing 5 changed files with 133 additions and 72 deletions (+133, -72)
Streamlit/lang/text_CleanCSVtoTSV.csv    +3   -0
Streamlit/pages/Clean_CSV_to_TSV.py      +55  -44
Streamlit/pages/PDF_to_TSV.py            +4   -6
apiBookToTSV/book/.placeholder           +0   -0
apiBookToTSV/src/BookToTSVAPI.py         +71  -22
Streamlit/lang/text_CleanCSVtoTSV.csv

@@ -49,3 +49,6 @@ en,correct_file,"The given file is correct."
 fr,new_file,"Télécharger le fichier TSV :"
 en,new_file,"Download the TSV file : "
+fr,err_detect_encoding,"L'encodage n'a pas pu être détecté"
+en,err_detect_encoding,"Encoding could not be detected"
Streamlit/pages/Clean_CSV_to_TSV.py

@@ -6,14 +6,16 @@ Marie FU
 from io import StringIO
 import typing
+import chardet
 import streamlit as st
 import csv
 import re
 import codecs
 import src.basic as tmp
+from streamlit.errors import StreamlitAPIException

 # Define constant for file encoding supported (name in lower case)
-FILE_ENCODING = "utf-8"
+FILE_ENCODING = ["utf-8", "utf-8-sig", "ascii"]

 # Define constant for GarganText TSV columns
 TSV_COLUMNS = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Source", "Abstract"]

@@ -35,15 +37,19 @@ def checkEncoding() -> bool:
         (Boolean) : True if encoded correctly, False otherwise

     Authors:
+        Nicolas Atrax
         Marie FU
     """
     content = STATE.file.read()
-    try:
-        content.decode(FILE_ENCODING)
-        return True
-    except UnicodeDecodeError:
-        STATE.errMessageLog += st.session_state.general_text_dict['err_file_encoding']
+    STATE.encoding = chardet.detect(content)["encoding"]
+    if STATE.encoding is not None:
+        if STATE.encoding.lower() not in FILE_ENCODING:
+            STATE.errMessageLog += st.session_state.general_text_dict['err_file_encoding']
+            return False
+        return True
+    else:
+        STATE.errMessageLog += st.session_state.general_text_dict['err_detect_encoding']
         return False

@@ -64,7 +70,7 @@ def getSeparator() -> typing.Union[str,None]:
     """
     toStartOfFile()
-    line = STATE.file.readline().decode('utf-8')
+    line = STATE.file.readline().decode(STATE.encoding)
     if ',' in line:
         if '\t' in line or ';' in line:

@@ -246,7 +252,7 @@ def getColumnsNames(separator : str) -> typing.Union[bool,None]:
     """
     registeredNames = []
-    line = StringIO(STATE.file.getvalue().decode("utf-8")).read().split("\n")[0].split(separator)
+    line = StringIO(STATE.file.getvalue().decode(STATE.encoding)).read().split("\n")[0].split(separator)
     othersColumns = []
     for name in line:
         registeredNames, otherColumns = checkColumnNames(name, registeredNames, othersColumns)

@@ -347,11 +353,11 @@ def getContent(separator : str) -> None:
     csvLine = 2
-    reader = csv.DictReader(codecs.iterdecode(STATE.file, 'utf-8'), delimiter=separator)
+    reader = csv.DictReader(codecs.iterdecode(STATE.file, STATE.encoding), delimiter=separator)
     for row in reader:
         for name, value in row.items():
             if name in STATE.columnMap.keys() and value is not None:
-                value = value.replace("\"", "”")
+                value = value.replace("\"", "”").replace("\n", " ")
                 checkMissing(lowerName(name), value, csvLine)
         csvLine += 1
     STATE.fileData = STATE.fileData[:-1] + "\n"

@@ -376,7 +382,11 @@ def show_download_button() -> None:
         Marie FU
     """
-    st.download_button(STATE.newFileName, STATE.fileData, STATE.newFileName)
+    try:
+        st.download_button(STATE.newFileName, STATE.fileData, STATE.newFileName)
+    except StreamlitAPIException:
+        st.write("duplicate file")
+        print("here")
     return

@@ -411,7 +421,7 @@ def checkNewFileName() -> bool :
         STATE.errMessageLog += st.session_state.general_text_dict['err_file_name']
         return False
     else:
-        STATE.newFileName = "File.tsv"
+        STATE.newFileName = STATE.file.name.split(".")[0] + ".tsv"
     return True

 @st.experimental_fragment

@@ -430,43 +440,44 @@ st.write(st.session_state.general_text_dict['description'])
 STATE.newFileName = st.text_input(label=st.session_state.general_text_dict['file_name_input'], placeholder="File.tsv")
-st.file_uploader(st.session_state.general_text_dict['file'], type=["tsv", "csv"], key="file")
+st.file_uploader(st.session_state.general_text_dict['file'], type=["tsv", "csv"], key="file_", accept_multiple_files=True)

 # checking if a file is uploaded
-if STATE.file is not None:
-    # first utilisation of errMessageLog, contain the log of every error message encountered
-    STATE.errMessageLog = ""
-    if not checkNewFileName() or not checkEncoding() or getSeparator() is None:
-        errDisplay()
-    else:
-        # set file pointer to start of file, will be reset each time between each file operation
-        toStartOfFile()
-        separator = getSeparator()
-        STATE.columnMap = {}
-        if getColumnsNames(separator): # type: ignore
-            toStartOfFile()
-            STATE.fileData = ""
-            addColumnsNamestoTSV()
-            STATE.contentProblem = False
-            getContent(separator) # type: ignore
-            toStartOfFile()
-            if not STATE.contentProblem:
-                STATE.content = ""
-                STATE.content = STATE.file.read().decode('utf-8')
-                if STATE.content == STATE.fileData:
-                    st.write(STATE.general_text_dict['correct_file'])
-                else:
-                    st.write(st.session_state.general_text_dict['new_file'])
-                    show_download_button()
-        else:
-            errDisplay()
+if STATE.file_ is not None:
+    for f in STATE.file_:
+        STATE.file = f
+        STATE.newFileName = None
+        # first utilisation of errMessageLog, contain the log of every error message encountered
+        STATE.errMessageLog = ""
+        if not checkNewFileName() or not checkEncoding() or getSeparator() is None:
+            errDisplay()
+        else:
+            # set file pointer to start of file, will be reset each time between each file operation
+            toStartOfFile()
+            separator = getSeparator()
+            STATE.columnMap = {}
+            if getColumnsNames(separator): # type: ignore
+                toStartOfFile()
+                STATE.fileData = ""
+                addColumnsNamestoTSV()
+                STATE.contentProblem = False
+                getContent(separator) # type: ignore
+                toStartOfFile()
+                if not STATE.contentProblem:
+                    STATE.content = ""
+                    STATE.content = STATE.file.read().decode(STATE.encoding)
+                    if STATE.content == STATE.fileData:
+                        st.write(STATE.general_text_dict['correct_file'])
+                    else:
+                        st.write(st.session_state.general_text_dict['new_file'])
+                        show_download_button()
+            else:
+                errDisplay()
 # STATE.file.close()
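For reference, the encoding check introduced above can be exercised outside of Streamlit. This is a minimal sketch assuming only the chardet package; the app's STATE session object and localized error strings are replaced by plain return values, and the sample inputs are made up:

    import chardet

    # Supported encodings, as in the new FILE_ENCODING constant (lower case).
    FILE_ENCODING = ["utf-8", "utf-8-sig", "ascii"]

    def check_encoding(content: bytes) -> bool:
        """Detect the encoding of raw bytes and accept only the supported ones."""
        encoding = chardet.detect(content)["encoding"]
        if encoding is None:
            # Corresponds to the new err_detect_encoding message
            return False
        return encoding.lower() in FILE_ENCODING

    sample_ok = "Publication Day\tAuthors\tTitle\tAbstract\n".encode("ascii")
    sample_bad = ("Télécharger le fichier TSV : résumé, méthodologie\n" * 10).encode("latin-1")
    print(check_encoding(sample_ok))   # True: "ascii" is in FILE_ENCODING
    print(check_encoding(sample_bad))  # False: not detected as a supported encoding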
Streamlit/pages/PDF_to_TSV.py

@@ -7,10 +7,8 @@ import shutil
 import zipfile
 import tempfile
 import os
-import codecs
 from datetime import date
 import re
-import chardet
 import pandas as pd
 import streamlit as st
 import lib.tika.tika as tika

@@ -178,14 +176,14 @@ def getInfo():
 def pdfToTSV(fileName, fileAddress, pdfDir):
     st.session_state.page = 1
     author, title, watermark = getInfo()
-    tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract\n"
+    tsv = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
     with st.spinner(st.session_state.general_text_dict['loading']):
         tsv = segmentAbstract(fileName, fileAddress, tsv, author, title,
                               str(date.today().year), "1", "1", watermark)
     if '/' in fileName:
         fileName = fileName.split('/')[1]
-    with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w", encoding="utf-8-sig") as file:
+    with open(pdfDir + "/" + fileName.replace(".pdf", "(pdf).tsv"), "w") as file:
         file.write(tsv)
     tsv = "\n".join(tsv.split("\n")[1:])

@@ -271,7 +269,7 @@ if st.session_state.page == 0:
     if st.session_state.file != None:
         st.session_state.zipDir = tempfile.TemporaryDirectory()
         st.session_state.pdfDir = tempfile.TemporaryDirectory()
-        st.session_state.tsv = "authors\tsource\tpublication_year\tpublication_month\tpublication_day\ttitle\tabstract"
+        st.session_state.tsv = "Authors\tSource\tPublication Year\tpublication Month\tPublication_day\tTitle\tAbstract\n"
         st.session_state.page = 1
         extractAllPDF(st.session_state.zipDir.name, st.session_state.file)
         st.session_state.len = len(

@@ -302,7 +300,7 @@ if st.session_state.page == 1:
         askPDF(fileName)
 if st.session_state.page == 3:
-    with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w", encoding='utf-8-sig') as file:
+    with open(st.session_state.pdfDir.name + "/PDFCompilation.tsv", "w") as file:
         file.write(st.session_state.tsv)
     shutil.make_archive(st.session_state.zipDir.name +
                         "/PDFCompilation", 'zip', st.session_state.pdfDir.name)
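The header rewritten in these hunks is the tab-separated layout the converters feed to GarganText (the same column set appears as TSV_COLUMNS in Clean_CSV_to_TSV.py, in a different order). A minimal sketch of producing such a file with the standard csv module; the output file name and the row values are made up:

    import csv

    # Column names as written by the updated pdfToTSV header; the casing
    # ("publication Month", "Publication_day") is kept exactly as in the commit.
    HEADER = ["Authors", "Source", "Publication Year", "publication Month",
              "Publication_day", "Title", "Abstract"]

    with open("example(pdf).tsv", "w", newline="") as f:  # hypothetical file name
        writer = csv.writer(f, delimiter="\t")
        writer.writerow(HEADER)
        writer.writerow(["Doe, J.", "Sample source", "2024", "1", "1",
                         "Sample title", "Sample abstract"])  # made-up example row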
apiBookToTSV/book/.placeholder (new empty file, 0 → 100644)
apiBookToTSV/src/BookToTSVAPI.py

+import glob
+import os
+from pathlib import Path
 from bs4 import BeautifulSoup
 from flask import Flask, request
 from grobid_client.grobid_client import GrobidClient
-from numpy import number
-
-app = Flask(__name__)

 def requestGC() -> None:

@@ -84,7 +87,10 @@ def getDate(soup : BeautifulSoup) -> str:
     date = soup.find('date')
     if date is not None:
         if date is str:
-            res = date['when']
+            try:
+                res = date['when']
+            except TypeError:
+                res = "1\t1\t1"
         else:
             res = "1\t1\t1"
     if "-" in res:

@@ -92,9 +98,9 @@ def getDate(soup : BeautifulSoup) -> str:
     dateTab = res.split("\t")
     if len(dateTab) == 2:
-        res += "1"
+        res += "\t1"
     elif len(dateTab) == 1:
-        res += "1\t1"
+        res += "\t1\t1"
     return res

@@ -146,6 +152,11 @@ def getAuthors(soup : BeautifulSoup) -> str:
     return authors.strip()

+def empty_content() -> None:
+    for f in glob.glob("./book/file*"):
+        os.remove(f)
+
 def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -> str:
     """

@@ -181,7 +192,7 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -> str:
     # Loop in paragraphs
     for paragraph in paragraphList:
-        fileData += date + "\t"
+        fileData += date + "\t" + authors + "\t"
         if source is not None and title is not None:
             fileData += title + ", " + source + "\t"
         elif source is None and title is not None:

@@ -209,29 +220,67 @@ def getData(soup : BeautifulSoup, title : str|None, date : str, authors : str) -> str:
         fileData += "Title\tAbstract"
         fileData += "\n"
-    fileData = "Publication Year\tPublication Month\tPublication Day\tSource\tTitle\tAbstract\tAuthors\n" + fileData.replace("\n", "\t" + authors + "\n")
     return fileData

-@app.route("/", methods=['POST'])
-def getBookTSV() -> str:
-    if request.method == 'POST':
-        # check if the post request has the file part
-        if 'file' not in request.files:
-            return "no file\n"
-        file = request.files['file']
-        with open("book/file.pdf", "wb") as f:
-            f.write(file.read())
-        requestGC()
-        soup = getXMLContent()
-        title = getBookTitle(soup)
-        date = getDate(soup)
-        authors = getAuthors(soup)
-        return getData(soup, title, date, authors)
-    return ""
+def checkXMLFile() -> bool:
+    if os.path.exists("./book/file.pdf"):
+        return True
+    return False
+
+def checkContent(data : str) -> str:
+    res = ""
+    for line in data.split("\n"):
+        if line != "":
+            if not line[0].isnumeric():
+                res = res[:-1] + line + "\n"
+            else:
+                res += line + "\n"
+    return res.replace("\"", "”")
+
+def create_app():
+    app = Flask(__name__)
+
+    @app.route("/getFile", methods=['POST'])
+    def getBookTSV() -> str:
+        if request.method == 'POST':
+            # check if the post request has the file part
+            empty_content()
+            if 'dir' not in request.form:
+                return "no directory path, curl needs to be like -- curl -X POST -F dir='my_directory_path' url --\n"
+            elif not os.path.exists(request.form["dir"]):
+                print(request.form["dir"])
+                return "directory does not exist, directory path should be absolute\n"
+            else:
+                dirName = request.form["dir"]
+                files = Path(dirName).glob('*.pdf')
+                allData = ""
+                for file in files:
+                    print(file.name)
+                    with open("book/file.pdf", "wb") as f:
+                        f.write(file.read_bytes())
+                    requestGC()
+                    if not checkXMLFile():
+                        return ""
+                    soup = getXMLContent()
+                    title = getBookTitle(soup)
+                    date = getDate(soup)
+                    authors = getAuthors(soup)
+                    allData += getData(soup, title, date, authors)
+                    empty_content()
+                return "Publication Year\tPublication Month\tPublication Day\tAuthors\tSource\tTitle\tAbstract\n" + checkContent(allData)
+        return ""
+    return app
+
+create_app()