Skip to content
Projects
Groups
Snippets
Help
Loading...
Help
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
G
GarganTools
Project
Project
Details
Activity
Releases
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
gargantext
GarganTools
Commits
cfc1e7cb
Commit
cfc1e7cb
authored
Jan 27, 2025
by
Marie FU
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
add conditions for method POST for validation of TSV contexts along with tests and OAS update
parent
313b7fcd
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
172 additions
and
108 deletions
+172
-108
contexts.py
gargantools/src/contexts.py
+24
-11
terms.py
gargantools/src/terms.py
+16
-1
contexts_utils.py
gargantools/utils/contexts_utils.py
+57
-0
utils.py
gargantools/utils/utils.py
+17
-65
openapi.yaml
openapi.yaml
+3
-1
test_contexts_utils.py
tests/test_contexts_utils.py
+12
-0
test_contexts_validate.py
tests/test_contexts_validate.py
+19
-18
incorrectDelimiter.csv
tests/test_files/incorrectDelimiter.csv
+3
-0
incorrectHeader.csv
tests/test_files/incorrectHeader.csv
+3
-0
test_utils.py
tests/test_utils.py
+18
-12
No files found.
gargantools/src/contexts.py
View file @
cfc1e7cb
import
json
from
flask
import
Blueprint
,
request
from
gargantools.utils.utils
import
check_fileContent
,
check_fileExtension
,
check_fileEncoding
from
gargantools.utils.contexts_utils
import
check_fileContent
from
gargantools.utils.utils
import
check_columnName
,
check_fileExtension
,
check_fileEncoding
,
get_fileContent
,
get_fileDelimiter
bp
=
Blueprint
(
"contexts"
,
__name__
,
url_prefix
=
"/contexts"
)
COLUMN_NAMES
=
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Abstract"
,
"Source"
]
bp
=
Blueprint
(
"contexts"
,
__name__
,
url_prefix
=
"/contexts"
)
@bp.get('')
def tsvTemplate():
    """Return a JSON example of a single contexts row.

    The object has one entry per template column (the three publication
    date parts, authors, title, abstract and source) filled with
    placeholder values, serialised with a 3-space indent.
    """
    sample_row = {
        "Publication Day": 1,
        "Publication Month": 1,
        "Publication Year": 1,
        "Authors": "Some authors",
        "Title": "A title",
        "Abstract": "An abstract",
        "Source": "Some Source",
    }
    return json.dumps(sample_row, indent=3)
@
bp
.
post
(
''
)
def
tsvValidat
e
():
def
tsvValidat
ion
():
if
'file'
not
in
request
.
files
:
print
(
request
.
files
)
return
"Bad request, missing file
\n
"
,
400
else
:
file
=
request
.
files
[
'file'
]
...
...
@@ -24,13 +25,25 @@ def tsvValidate():
file_encoding
=
check_fileEncoding
(
file
)
if
file_encoding
is
None
:
return
"Could not read the file
\n
"
,
422
elif
get_fileDelimiter
(
file
,
file_encoding
)
!=
'
\t
'
:
return
"File delimiter not found or Incorrect file delimiter, should be a tabulation
\n
"
,
422
else
:
state
,
problems
=
check_fileContent
(
file
,
file_encoding
)
if
state
:
return
"Correct file
\n
"
,
200
file_content
=
get_fileContent
(
file
,
'
\t
'
)
if
file_content
is
None
:
print
(
'here'
)
return
"Could not read the file
\n
"
,
422
else
:
if
problems
is
not
None
:
return
'Incorrect file - File is not compatible with GarganText
\n
'
,
422
header
=
file_content
.
keys
()
state
,
notFoundColumn
=
check_columnName
(
header
,
COLUMN_NAMES
)
if
not
state
:
return
f
"Some column names were not found {','.join(notFoundColumn)}
\n
"
,
422
else
:
return
'Incorrect file - File is malformed
\n
'
,
422
\ No newline at end of file
state
,
problems
=
check_fileContent
(
file_content
)
if
state
:
return
"Correct file
\n
"
,
200
else
:
if
problems
is
not
None
:
return
'Incorrect file - File is not compatible with GarganText
\n
'
,
422
else
:
return
'Unexpected error in file
\n
'
,
422
\ No newline at end of file
gargantools/src/terms.py
View file @
cfc1e7cb
import
json
from
flask
import
Blueprint
from
flask
import
Blueprint
,
request
from
gargantools.utils.utils
import
check_fileEncoding
,
check_fileExtension
bp
=
Blueprint
(
"terms"
,
__name__
,
url_prefix
=
"/terms"
)
...
...
@@ -7,3 +9,16 @@ bp = Blueprint("terms", __name__, url_prefix="/terms")
@bp.get('')
def termsTemplate():
    """Return a JSON example of a single term entry.

    The object carries a ``status`` and a ``label`` key with placeholder
    values and is serialised with a 3-space indent.
    """
    sample_term = {
        "status": "MapTerm",
        "label": "A term",
    }
    return json.dumps(sample_term, indent=3)
@bp.post('')
def termsValidation():
    """Run the preliminary checks on a terms file uploaded as ``file``.

    Returns a ``(message, status)`` pair when a check fails:

    * 400 if the request has no ``file`` part;
    * 400 if the upload is falsy or its name fails the csv/tsv extension
      check;
    * 422 if ``check_fileEncoding`` returns ``None`` for the upload.
    """
    if 'file' not in request.files:
        return "Bad request, missing file\n", 400

    upload = request.files['file']
    extension_ok = upload and check_fileExtension(upload.filename, {"csv", "tsv"})
    if not extension_ok:
        return "Incorrect file format or file format not found\n", 400

    if check_fileEncoding(upload) is None:
        return "Could not read the file\n", 422
    # NOTE(review): nothing is returned when every check above passes, so the
    # view yields None on the success path -- confirm whether the remaining
    # terms validation and its success response are still to be written.
\ No newline at end of file
gargantools/utils/contexts_utils.py
0 → 100644
View file @
cfc1e7cb
import
csv
import
pandas
as
pd
import
petl
from
gargantools.utils.utils
import
check_date
,
check_unacceptedCharactersQuote
,
check_unacceptedCharactersTab
def check_fileContent(fileContent):
    """Validate a parsed contexts table against the template rules.

    ``fileContent`` is read through ``.columns.to_list()`` and
    ``.values.tolist()`` (presumably a pandas DataFrame -- confirm against
    the caller). The header row and data rows are handed to
    ``petl.validate`` together with:

    * integer/range rules for the publication day, month and year;
    * a combined date rule delegated to ``check_date``;
    * a quote rule and a tab rule for each of the text columns.

    Returns ``(True, None)`` when validation reports nothing, otherwise
    ``(False, problems)`` where ``problems`` is the table produced by
    ``petl.validate``.
    """
    expected_header = (
        "Publication Day",
        "Publication Month",
        "Publication Year",
        "Authors",
        "Title",
        "Abstract",
        "Source",
    )
    text_columns = ("Authors", "Title", "Abstract", "Source")

    rules = [
        dict(name="pub_day", field="Publication Day", test=int,
             assertion=lambda day: 0 < day <= 31),
        dict(name="pub_month", field="Publication Month", test=int,
             assertion=lambda month: 0 < month <= 12),
        dict(name="pub_year", field="Publication Year", test=int,
             assertion=lambda year: year > 0),
        # The selected values arrive as (day, month, year); check_date is
        # called with them in reverse order.
        dict(name="date_format",
             field=["Publication Day", "Publication Month", "Publication Year"],
             assertion=lambda row: check_date(row[2], row[1], row[0])),
    ]
    # Every text column gets the same pair of rules, quote rule first.
    for column in text_columns:
        rules.append(dict(name="str_formatQuote", field=column,
                          assertion=check_unacceptedCharactersQuote))
        rules.append(dict(name="str_formatTab", field=column,
                          assertion=check_unacceptedCharactersTab))

    # petl expects the header as the first row of the table.
    rows = [fileContent.columns.to_list()]
    rows.extend(fileContent.values.tolist())

    problems = petl.validate(rows, rules, expected_header)
    # A length of 1 is treated as "no problem" -- presumably the problems
    # table's own header row; verify against petl.
    if problems.len() > 1:
        return False, problems
    return True, None
gargantools/utils/utils.py
View file @
cfc1e7cb
import
csv
import
datetime
import
chardet
import
petl
as
etl
import
pandas
as
pd
ALLOWED_ENCODING
=
{
"utf-8"
,
"utf-8-sig"
,
"ascii"
}
...
...
@@ -37,17 +35,30 @@ def check_fileEncoding(file):
def get_fileName(filename):
    """Return ``filename`` without its last extension.

    Only the text after the final ``.`` is dropped, so ``"a.b.csv"`` gives
    ``"a.b"``. A name that contains no ``.`` is returned unchanged (the
    previous slicing raised ``IndexError`` in that case).
    """
    stem, dot, _extension = filename.rpartition('.')
    # rpartition yields an empty separator when there is no dot at all.
    return stem if dot else filename
def
check_columnName
(
list_columnNames
):
def get_fileDelimiter(file, file_encoding):
    """Detect the column delimiter of an uploaded delimited-text file.

    ``file`` is a binary stream; its whole content is read, decoded with
    ``file_encoding`` and passed to ``csv.Sniffer``.

    Returns the detected delimiter, or ``None`` when the sniffer cannot
    determine one (it raises ``csv.Error`` in that case), so callers that
    compare the result with an expected delimiter report a validation
    failure instead of crashing. Decoding errors are not handled here.

    The stream is rewound to position 0 in every case so it can be read
    again by the next validation step.
    """
    try:
        text = file.read().decode(file_encoding)
        return csv.Sniffer().sniff(text).delimiter
    except csv.Error:
        return None
    finally:
        file.seek(0)
def get_fileContent(file, fileDelimiter):
    """Parse ``file`` with pandas using ``fileDelimiter`` as separator.

    Returns the resulting DataFrame, or ``None`` if ``pd.read_csv`` raises
    any exception.
    """
    try:
        return pd.read_csv(file, sep=fileDelimiter)
    except Exception:
        # Best-effort read: every parsing failure is reported as None.
        return None
def
check_columnName
(
list_columnNames
,
correct_columnNames
):
if
len
(
list_columnNames
)
>
7
:
return
False
,
[]
else
:
correct_columnNames
=
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Abstract"
,
"Source"
]
notFound_columnNames
=
correct_columnNames
notFound_columnNames
=
correct_columnNames
.
copy
()
for
colummnName
in
list_columnNames
:
if
colummnName
in
correct_columnNames
:
notFound_columnNames
.
remove
(
colummnName
)
if
len
(
notFound_columnNames
)
!=
0
:
return
False
,
notFound_columnNames
else
:
...
...
@@ -65,63 +76,4 @@ def check_unacceptedCharactersQuote(cell):
def check_unacceptedCharactersTab(cell):
    """Return ``True`` when ``cell`` contains no tab character.

    The tab is written as the ``"\\t"`` escape rather than as literal
    whitespace inside the quotes, so the check cannot silently turn into a
    test for spaces when the file is reformatted or copied.
    """
    return "\t" not in cell
def
check_fileContent
(
file
,
file_encoding
):
header
=
(
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Abstract"
,
"Source"
)
constraints
=
[
dict
(
name
=
"pub_day"
,
field
=
"Publication Day"
,
test
=
int
,
assertion
=
lambda
x
:
0
<
x
<=
31
),
dict
(
name
=
"pub_month"
,
field
=
"Publication Month"
,
test
=
int
,
assertion
=
lambda
x
:
0
<
x
<=
12
),
dict
(
name
=
"pub_year"
,
field
=
"Publication Year"
,
test
=
int
,
assertion
=
lambda
x
:
x
>
0
),
dict
(
name
=
"date_format"
,
field
=
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
],
assertion
=
lambda
row
:
check_date
(
row
[
2
],
row
[
1
],
row
[
0
])
),
dict
(
name
=
"str_formatQuote"
,
field
=
"Authors"
,
assertion
=
lambda
x
:
check_unacceptedCharactersQuote
(
x
)
),
dict
(
name
=
"str_formatTab"
,
field
=
"Authors"
,
assertion
=
lambda
x
:
check_unacceptedCharactersTab
(
x
)
),
dict
(
name
=
"str_formatQuote"
,
field
=
"Title"
,
assertion
=
lambda
x
:
check_unacceptedCharactersQuote
(
x
)
),
dict
(
name
=
"str_formatTab"
,
field
=
"Title"
,
assertion
=
lambda
x
:
check_unacceptedCharactersTab
(
x
)
),
dict
(
name
=
"str_formatQuote"
,
field
=
"Abstract"
,
assertion
=
lambda
x
:
check_unacceptedCharactersQuote
(
x
)
),
dict
(
name
=
"str_formatTab"
,
field
=
"Abstract"
,
assertion
=
lambda
x
:
check_unacceptedCharactersTab
(
x
)
),
dict
(
name
=
"str_formatQuote"
,
field
=
"Source"
,
assertion
=
lambda
x
:
check_unacceptedCharactersQuote
(
x
)
),
dict
(
name
=
"str_formatTab"
,
field
=
"Source"
,
assertion
=
lambda
x
:
check_unacceptedCharactersTab
(
x
)
)
]
fileDelimiter
=
csv
.
Sniffer
()
.
sniff
(
file
.
read
()
.
decode
(
file_encoding
))
.
delimiter
file
.
seek
(
0
)
try
:
fileContent
=
pd
.
read_csv
(
file
,
sep
=
fileDelimiter
)
except
Exception
:
return
False
,
None
dataTable
=
fileContent
.
values
.
tolist
()
dataTable
.
insert
(
0
,
fileContent
.
columns
.
to_list
())
problemCells
=
etl
.
validate
(
dataTable
,
constraints
,
header
)
if
problemCells
.
len
()
>
1
:
return
False
,
problemCells
else
:
return
True
,
None
\ No newline at end of file
openapi.yaml
View file @
cfc1e7cb
...
...
@@ -83,7 +83,9 @@ paths:
example
:
-
Could not read the file
-
Incorrect file - File is not compatible with GarganText
-
Incorrect file - File is malformed
-
File delimiter not found or Incorrect file delimiter, should be a tabulation
-
Some column names were not found ...
-
Unexpected error in file
'
500'
:
description
:
Unexpected Error
/contexts/{from}
:
...
...
tests/test_contexts_utils.py
0 → 100644
View file @
cfc1e7cb
from
io
import
BytesIO
import
pytest
from
gargantools.utils.contexts_utils
import
check_fileContent
from
gargantools.utils.utils
import
get_fileContent
@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
def test_check_fileContent(test_file_copy):
    """The reference ``correct.csv`` fixture passes content validation."""
    with open(test_file_copy, "r") as handle:
        raw_bytes = handle.read().encode("utf-8")
    # Parse the fixture as tab-separated, then validate the parsed table.
    parsed = get_fileContent(BytesIO(raw_bytes), '\t')
    assert check_fileContent(parsed) == (True, None)
tests/test_contexts_validate.py
View file @
cfc1e7cb
...
...
@@ -10,30 +10,31 @@ def test_missingFile(client):
assert
response
.
data
==
b
"Bad request, missing file
\n
"
assert
response
.
status_code
==
400
@
pytest
.
mark
.
parametrize
(
"test_file_copy"
,
[
"incorrectExtention"
],
indirect
=
True
)
def
test_fileExtensionError
(
client
,
test_file_copy
):
with
open
(
test_file_copy
,
"rb"
)
as
f
:
f_data
=
BytesIO
(
f
.
read
())
file_storage
=
FileStorage
(
f_data
,
filename
=
test_file_copy
)
response
=
client
.
post
(
'/contexts'
,
data
=
{
'file'
:
file_storage
},
content_type
=
'multipart/form-data'
)
assert
response
.
data
==
b
"Incorrect file format or file format not found
\n
"
assert
response
.
status_code
==
400
def
test_fileEncodingError
(
client
):
# TODO: write test
return
@
pytest
.
mark
.
parametrize
(
"test_file_copy"
,
[
"incorrect.csv"
,
"malformed.csv"
],
indirect
=
True
)
@
pytest
.
mark
.
parametrize
(
"test_file_copy"
,
[
"incorrect.csv"
,
"malformed.csv"
,
"incorrectDelimiter.csv"
,
"incorrectHeader.csv"
,
"incorrectExtention"
],
indirect
=
True
)
def
test_incorrectFile
(
client
,
test_file_copy
):
with
open
(
test_file_copy
,
"rb"
)
as
f
:
f_data
=
BytesIO
(
f
.
read
())
file_storage
=
FileStorage
(
f_data
,
filename
=
test_file_copy
)
response
=
client
.
post
(
'/contexts'
,
data
=
{
'file'
:
file_storage
},
content_type
=
'multipart/form-data'
)
assert
response
.
status_code
==
422
print
(
test_file_copy
)
if
"incorrect.csv"
in
test_file_copy
:
assert
response
.
data
==
b
'Incorrect file - File is not compatible with GarganText
\n
'
elif
"malformed.csv"
in
test_file_copy
:
assert
response
.
data
==
b
"Could not read the file
\n
"
elif
"incorrectDelimiter.csv"
in
test_file_copy
:
assert
response
.
data
==
b
"File delimiter not found or Incorrect file delimiter, should be a tabulation
\n
"
elif
"incorrectHeader.csv"
in
test_file_copy
:
assert
b
"Some column names were not found"
in
response
.
data
elif
"incorrectExtention"
in
test_file_copy
:
assert
response
.
data
==
b
"Incorrect file format or file format not found
\n
"
assert
response
.
status_code
==
400
else
:
assert
False
if
"incorrectExtention"
not
in
test_file_copy
:
assert
response
.
status_code
==
422
@
pytest
.
mark
.
parametrize
(
"test_file_copy"
,
[
"correct.csv"
],
indirect
=
True
)
def
test_correctFile
(
client
,
test_file_copy
):
...
...
tests/test_files/incorrectDelimiter.csv
0 → 100644
View file @
cfc1e7cb
Publication Day,Publication Month,Publication Year,Authors,Title,Abstract,Source
31,12,1,an author,a ,title,an abstract,a source
1,1,1,an author,"a """"title""""","an """"""""abstract""""""""",a source
tests/test_files/incorrectHeader.csv
0 → 100644
View file @
cfc1e7cb
Publication Month Publication Year Authors Title Abstract Source
12 1 an author a ,title an abstract a source
1 1 an author "a """"title""""" "an """"""""abstract""""""""" a source
tests/test_utils.py
View file @
cfc1e7cb
from
io
import
BytesIO
import
os
import
pytest
from
gargantools.utils.contexts_utils
import
check_fileContent
from
gargantools.utils.utils
import
*
TEST_FILES_DIR
=
os
.
path
.
join
(
os
.
path
.
dirname
(
__file__
),
'test_files'
)
def
test_check_fileExtension
():
list_trueFile
=
[
'a.csv'
,
'a.tsv'
,
'a.pdf'
,
'a.html'
,
'a.txt'
,
'a.b.c.csv'
]
list_falseFile
=
[
'a'
,
'a.c'
]
...
...
@@ -33,26 +33,32 @@ def test_get_fileName():
assert
get_fileName
(
"afilename.csv"
)
==
"afilename"
assert
get_fileName
(
"afile.name.csv"
)
==
"afile.name"
@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
def test_get_fileDelimiter(test_file_copy):
    """The delimiter detected in the ``correct.csv`` fixture is a tab."""
    with open(test_file_copy, "r") as handle:
        raw_bytes = handle.read().encode("utf-8")
    detected = get_fileDelimiter(BytesIO(raw_bytes), "utf-8")
    assert detected == "\t"
@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
def test_get_fileContent(test_file_copy):
    """Reading the ``correct.csv`` fixture as tab-separated yields content."""
    with open(test_file_copy, "r") as handle:
        raw_bytes = handle.read().encode("utf-8")
    parsed = get_fileContent(BytesIO(raw_bytes), "\t")
    assert parsed is not None
def
test_check_columnName
():
correct_column
=
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Abstract"
,
"Source"
]
list_trueColumnName
=
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Abstract"
,
"Source"
]
list_falseColumnName
=
[
"Publication day"
,
"Publication Month"
,
" Year"
,
"Authors"
,
"Title"
,
"Abstract"
,
"Source"
]
list_falseColumnName_more
=
[
"Publication Day"
,
"Publication Month"
,
"Publication Year"
,
"Authors"
,
"Title"
,
"Abstract"
,
"Source"
,
"azerty"
]
list_falseColumnName_less
=
[
"Publication Day"
,
"Publication Year"
,
"Authors"
,
"Title"
]
assert
check_columnName
(
list_trueColumnName
)
==
(
True
,
[])
assert
check_columnName
(
list_falseColumnName
)
==
(
False
,
[
"Publication Day"
,
"Publication Year"
])
assert
check_columnName
(
list_falseColumnName_more
)
==
(
False
,
[])
assert
check_columnName
(
list_falseColumnName_less
)
==
(
False
,
[
"Publication Month"
,
"Abstract"
,
"Source"
])
assert
check_columnName
(
list_trueColumnName
,
correct_column
)
==
(
True
,
[])
assert
check_columnName
(
list_falseColumnName
,
correct_column
)
==
(
False
,
[
"Publication Day"
,
"Publication Year"
])
assert
check_columnName
(
list_falseColumnName_more
,
correct_column
)
==
(
False
,
[])
assert
check_columnName
(
list_falseColumnName_less
,
correct_column
)
==
(
False
,
[
"Publication Month"
,
"Abstract"
,
"Source"
])
def test_check_date():
    """``check_date`` accepts 1/1/1 and rejects three invalid triples."""
    assert check_date(1, 1, 1)
    rejected = [
        (0, 1, 1),
        (2000, 2, 30),
        (2000, 13, 1),
    ]
    for first, second, third in rejected:
        assert not check_date(first, second, third)
@
pytest
.
mark
.
parametrize
(
"test_file_copy"
,
[
"correct.csv"
],
indirect
=
True
)
def
test_check_fileContent
(
test_file_copy
):
with
open
(
test_file_copy
,
"r"
)
as
f
:
f_data
=
BytesIO
(
f
.
read
()
.
encode
(
"utf-8"
))
assert
check_fileContent
(
f_data
,
"utf-8"
)
==
(
True
,
None
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment