Commit cfc1e7cb authored by Marie FU

add conditions to the POST method for validating TSV contexts, along with tests and an OAS update

parent 313b7fcd
import json

from flask import Blueprint, request

-from gargantools.utils.utils import check_fileContent, check_fileExtension, check_fileEncoding
+from gargantools.utils.contexts_utils import check_fileContent
+from gargantools.utils.utils import check_columnName, check_fileExtension, check_fileEncoding, get_fileContent, get_fileDelimiter

+COLUMN_NAMES = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source"]

bp = Blueprint("contexts", __name__, url_prefix="/contexts")


@bp.get('')
def tsvTemplate():
    return json.dumps({"Publication Day": 1, "Publication Month": 1, "Publication Year": 1, "Authors": "Some authors", "Title": "A title", "Abstract": "An abstract", "Source": "Some Source"}, indent=3)
@bp.post('')
-def tsvValidate():
+def tsvValidation():
    if 'file' not in request.files:
        print(request.files)
        return "Bad request, missing file\n", 400
    else:
        file = request.files['file']
@@ -24,13 +25,25 @@ def tsvValidate():
        file_encoding = check_fileEncoding(file)
        if file_encoding is None:
            return "Could not read the file\n", 422
+        elif get_fileDelimiter(file, file_encoding) != '\t':
+            return "File delimiter not found or Incorrect file delimiter, should be a tabulation\n", 422
        else:
-            state, problems = check_fileContent(file, file_encoding)
-            if state:
-                return "Correct file\n", 200
+            file_content = get_fileContent(file, '\t')
+            if file_content is None:
+                print('here')
+                return "Could not read the file\n", 422
            else:
-                if problems is not None:
-                    return 'Incorrect file - File is not compatible with GarganText\n', 422
+                header = file_content.keys()
+                state, notFoundColumn = check_columnName(header, COLUMN_NAMES)
+                if not state:
+                    return f"Some column names were not found {','.join(notFoundColumn)}\n", 422
                else:
-                    return 'Incorrect file - File is malformed\n', 422
\ No newline at end of file
+                    state, problems = check_fileContent(file_content)
+                    if state:
+                        return "Correct file\n", 200
+                    else:
+                        if problems is not None:
+                            return 'Incorrect file - File is not compatible with GarganText\n', 422
+                        else:
+                            return 'Unexpected error in file\n', 422
\ No newline at end of file
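For context, the sketch below shows how a client could exercise the new POST /contexts validation route. It is illustrative only: the base URL, the file name, and the use of the requests library are assumptions, not part of the commit (note that the repository's correct.csv fixture is tab-delimited despite its extension).

# Illustrative client sketch (assumption: the Flask app is served locally on port 5000).
import requests

with open("correct.csv", "rb") as f:
    response = requests.post(
        "http://localhost:5000/contexts",
        files={"file": ("correct.csv", f, "text/tab-separated-values")},
    )

# Expect 200 and "Correct file" for a valid tab-separated file, otherwise
# 400/422 with one of the error messages listed in the OAS example below.
print(response.status_code, response.text)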
import json

-from flask import Blueprint
+from flask import Blueprint, request
+from gargantools.utils.utils import check_fileEncoding, check_fileExtension

bp = Blueprint("terms", __name__, url_prefix="/terms")
@@ -7,3 +9,16 @@ bp = Blueprint("terms", __name__, url_prefix="/terms")

@bp.get('')
def termsTemplate():
    return json.dumps({"status": "MapTerm", "label": "A term"}, indent=3)

+@bp.post('')
+def termsValidation():
+    if 'file' not in request.files:
+        return "Bad request, missing file\n", 400
+    else:
+        file = request.files['file']
+        if not (file and check_fileExtension(file.filename, {"csv", "tsv"})):
+            return "Incorrect file format or file format not found\n", 400
+        else:
+            file_encoding = check_fileEncoding(file)
+            if file_encoding is None:
+                return "Could not read the file\n", 422
\ No newline at end of file
import csv

import pandas as pd
import petl

from gargantools.utils.utils import check_date, check_unacceptedCharactersQuote, check_unacceptedCharactersTab


def check_fileContent(fileContent):
    header = ("Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source")
    constraints = [
        dict(
            name = "pub_day", field = "Publication Day", test = int, assertion = lambda x: 0 < x <= 31
        ),
        dict(
            name = "pub_month", field = "Publication Month", test = int, assertion = lambda x : 0 < x <= 12
        ),
        dict(
            name = "pub_year", field = "Publication Year", test = int, assertion = lambda x: x > 0
        ),
        dict(
            name = "date_format", field = ["Publication Day", "Publication Month", "Publication Year"], assertion = lambda row : check_date(row[2], row[1], row[0])
        ),
        dict(
            name = "str_formatQuote", field = "Authors", assertion = lambda x : check_unacceptedCharactersQuote(x)
        ),
        dict(
            name = "str_formatTab", field = "Authors", assertion = lambda x : check_unacceptedCharactersTab(x)
        ),
        dict(
            name = "str_formatQuote", field = "Title", assertion = lambda x : check_unacceptedCharactersQuote(x)
        ),
        dict(
            name = "str_formatTab", field = "Title", assertion = lambda x : check_unacceptedCharactersTab(x)
        ),
        dict(
            name = "str_formatQuote", field = "Abstract", assertion = lambda x : check_unacceptedCharactersQuote(x)
        ),
        dict(
            name = "str_formatTab", field = "Abstract", assertion = lambda x : check_unacceptedCharactersTab(x)
        ),
        dict(
            name = "str_formatQuote", field = "Source", assertion = lambda x : check_unacceptedCharactersQuote(x)
        ),
        dict(
            name = "str_formatTab", field = "Source", assertion = lambda x : check_unacceptedCharactersTab(x)
        )
    ]
    dataTable = fileContent.values.tolist()
    dataTable.insert(0, fileContent.columns.to_list())
    problemCells = petl.validate(dataTable, constraints, header)
    if problemCells.len() > 1:
        return False, problemCells
    else:
        return True, None
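To make the return contract of the new check_fileContent helper concrete, here is a minimal sketch of calling it directly on an in-memory pandas DataFrame. The one-row frame below is an assumption chosen to satisfy every constraint; it is not part of the commit.

# Minimal sketch: check_fileContent operates on an already-parsed DataFrame,
# not on the uploaded file itself. The sample row is an illustrative assumption.
import pandas as pd

from gargantools.utils.contexts_utils import check_fileContent

df = pd.DataFrame([{
    "Publication Day": 1,
    "Publication Month": 1,
    "Publication Year": 2024,
    "Authors": "Some authors",
    "Title": "A title",
    "Abstract": "An abstract",
    "Source": "Some Source",
}])

state, problems = check_fileContent(df)
print(state)     # True when every petl constraint passes
print(problems)  # None on success, otherwise the petl table of failing cells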
import csv
import datetime
import chardet
import petl as etl
import pandas as pd
ALLOWED_ENCODING = {"utf-8", "utf-8-sig", "ascii"}
@@ -37,17 +35,30 @@ def check_fileEncoding(file):

def get_fileName(filename):
    return filename.rsplit('.', 1)[:-1][0]


-def check_columnName(list_columnNames):
+def get_fileDelimiter(file, file_encoding):
+    fileDelimiter = csv.Sniffer().sniff(file.read().decode(file_encoding)).delimiter
+    file.seek(0)
+    return fileDelimiter


+def get_fileContent(file, fileDelimiter):
+    try:
+        fileContent = pd.read_csv(file, sep=fileDelimiter)
+    except Exception:
+        return None
+    return fileContent


+def check_columnName(list_columnNames, correct_columnNames):
    if len(list_columnNames) > 7:
        return False, []
    else:
-        correct_columnNames = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source"]
-        notFound_columnNames = correct_columnNames
+        notFound_columnNames = correct_columnNames.copy()
        for colummnName in list_columnNames:
            if colummnName in correct_columnNames:
                notFound_columnNames.remove(colummnName)
        if len(notFound_columnNames) != 0:
            return False, notFound_columnNames
        else:
@@ -65,63 +76,4 @@ def check_unacceptedCharactersQuote(cell):

def check_unacceptedCharactersTab(cell):
    return (' ' in cell) == False

-def check_fileContent(file, file_encoding):
-    header = ("Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source")
-    constraints = [
-        dict(
-            name = "pub_day", field = "Publication Day", test = int, assertion = lambda x: 0 < x <= 31
-        ),
-        dict(
-            name = "pub_month", field = "Publication Month", test = int, assertion = lambda x : 0 < x <= 12
-        ),
-        dict(
-            name = "pub_year", field = "Publication Year", test = int, assertion = lambda x: x > 0
-        ),
-        dict(
-            name = "date_format", field = ["Publication Day", "Publication Month", "Publication Year"], assertion = lambda row : check_date(row[2], row[1], row[0])
-        ),
-        dict(
-            name = "str_formatQuote", field = "Authors", assertion = lambda x : check_unacceptedCharactersQuote(x)
-        ),
-        dict(
-            name = "str_formatTab", field = "Authors", assertion = lambda x : check_unacceptedCharactersTab(x)
-        ),
-        dict(
-            name = "str_formatQuote", field = "Title", assertion = lambda x : check_unacceptedCharactersQuote(x)
-        ),
-        dict(
-            name = "str_formatTab", field = "Title", assertion = lambda x : check_unacceptedCharactersTab(x)
-        ),
-        dict(
-            name = "str_formatQuote", field = "Abstract", assertion = lambda x : check_unacceptedCharactersQuote(x)
-        ),
-        dict(
-            name = "str_formatTab", field = "Abstract", assertion = lambda x : check_unacceptedCharactersTab(x)
-        ),
-        dict(
-            name = "str_formatQuote", field = "Source", assertion = lambda x : check_unacceptedCharactersQuote(x)
-        ),
-        dict(
-            name = "str_formatTab", field = "Source", assertion = lambda x : check_unacceptedCharactersTab(x)
-        )
-    ]
-    fileDelimiter = csv.Sniffer().sniff(file.read().decode(file_encoding)).delimiter
-    file.seek(0)
-    try:
-        fileContent = pd.read_csv(file, sep=fileDelimiter)
-    except Exception:
-        return False, None
-    dataTable = fileContent.values.tolist()
-    dataTable.insert(0, fileContent.columns.to_list())
-    problemCells = etl.validate(dataTable, constraints, header)
-    if problemCells.len() > 1:
-        return False, problemCells
-    else:
-        return True, None
\ No newline at end of file
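The sketch below chains the new utils helpers the same way the route does, on an in-memory TSV. It is illustrative only: BytesIO stands in for the uploaded file, and the sample data and local COLUMN_NAMES list are assumptions mirroring the ones used in contexts.py.

# Illustrative sketch of the new helper chain: sniff the delimiter, parse the
# file, then check the header. BytesIO stands in for Flask's uploaded file.
from io import BytesIO

from gargantools.utils.utils import check_columnName, get_fileContent, get_fileDelimiter

COLUMN_NAMES = ["Publication Day", "Publication Month", "Publication Year",
                "Authors", "Title", "Abstract", "Source"]

tsv = ("Publication Day\tPublication Month\tPublication Year\t"
       "Authors\tTitle\tAbstract\tSource\n"
       "1\t1\t2024\tan author\ta title\tan abstract\ta source\n")
file = BytesIO(tsv.encode("utf-8"))

delimiter = get_fileDelimiter(file, "utf-8")      # expected: "\t"
file_content = get_fileContent(file, delimiter)   # pandas DataFrame, or None on parse failure
state, missing = check_columnName(file_content.keys(), COLUMN_NAMES)
print(delimiter == "\t", state, missing)          # expected: True True []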
@@ -83,7 +83,9 @@ paths:
              example:
                - Could not read file
                - Incorrect file - File is not compatible with GarganText
-               - Incorrect file - File is malformed
+               - File delimiter not found or Incorrect file delimiter, should be a tabulation
+               - Some column names were not found ...
+               - Unexpected error in file
        '500':
          description: Unexpected Error
  /contexts/{from}:
......
from io import BytesIO

import pytest

from gargantools.utils.contexts_utils import check_fileContent
from gargantools.utils.utils import get_fileContent


@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
def test_check_fileContent(test_file_copy):
    with open(test_file_copy, "r") as f:
        f_data = BytesIO(f.read().encode("utf-8"))
    assert check_fileContent(get_fileContent(f_data, '\t')) == (True, None)
@@ -10,30 +10,31 @@ def test_missingFile(client):
    assert response.data == b"Bad request, missing file\n"
    assert response.status_code == 400


-@pytest.mark.parametrize("test_file_copy", ["incorrectExtention"], indirect=True)
-def test_fileExtensionError(client, test_file_copy):
-    with open(test_file_copy, "rb") as f:
-        f_data = BytesIO(f.read())
-    file_storage = FileStorage(f_data, filename=test_file_copy)
-    response = client.post('/contexts', data={'file': file_storage}, content_type='multipart/form-data')
-    assert response.data == b"Incorrect file format or file format not found\n"
-    assert response.status_code == 400


def test_fileEncodingError(client):
    # TODO: write test
    return


-@pytest.mark.parametrize("test_file_copy", ["incorrect.csv", "malformed.csv"], indirect=True)
+@pytest.mark.parametrize("test_file_copy", ["incorrect.csv", "malformed.csv", "incorrectDelimiter.csv", "incorrectHeader.csv", "incorrectExtention"], indirect=True)
def test_incorrectFile(client, test_file_copy):
    with open(test_file_copy, "rb") as f:
        f_data = BytesIO(f.read())
    file_storage = FileStorage(f_data, filename=test_file_copy)
    response = client.post('/contexts', data={'file': file_storage}, content_type='multipart/form-data')
-    assert response.status_code == 422
+    print(test_file_copy)
    if "incorrect.csv" in test_file_copy:
        assert response.data == b'Incorrect file - File is not compatible with GarganText\n'
    elif "malformed.csv" in test_file_copy:
        assert response.data == b"Could not read the file\n"
+    elif "incorrectDelimiter.csv" in test_file_copy:
+        assert response.data == b"File delimiter not found or Incorrect file delimiter, should be a tabulation\n"
+    elif "incorrectHeader.csv" in test_file_copy:
+        assert b"Some column names were not found" in response.data
+    elif "incorrectExtention" in test_file_copy:
+        assert response.data == b"Incorrect file format or file format not found\n"
+        assert response.status_code == 400
+    else:
+        assert False
+    if "incorrectExtention" not in test_file_copy:
+        assert response.status_code == 422


@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
def test_correctFile(client, test_file_copy):
......
Publication Day,Publication Month,Publication Year,Authors,Title,Abstract,Source
31,12,1,an author,a ,title,an abstract,a source
1,1,1,an author,"a """"title""""","an """"""""abstract""""""""",a source
Publication Month Publication Year Authors Title Abstract Source
12 1 an author a ,title an abstract a source
1 1 an author "a """"title""""" "an """"""""abstract""""""""" a source
from io import BytesIO
import os

import pytest

from gargantools.utils.contexts_utils import check_fileContent
from gargantools.utils.utils import *

TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), 'test_files')


def test_check_fileExtension():
    list_trueFile = ['a.csv', 'a.tsv', 'a.pdf', 'a.html', 'a.txt', 'a.b.c.csv']
    list_falseFile = ['a', 'a.c']
@@ -33,26 +33,32 @@ def test_get_fileName():
    assert get_fileName("afilename.csv") == "afilename"
    assert get_fileName("afile.name.csv") == "afile.name"


+@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
+def test_get_fileDelimiter(test_file_copy):
+    with open(test_file_copy, "r") as f:
+        f_data = BytesIO(f.read().encode("utf-8"))
+    assert get_fileDelimiter(f_data, "utf-8") == "\t"


+@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
+def test_get_fileContent(test_file_copy):
+    with open(test_file_copy, "r") as f:
+        f_data = BytesIO(f.read().encode("utf-8"))
+    assert get_fileContent(f_data, "\t") is not None


def test_check_columnName():
+    correct_column = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source"]
    list_trueColumnName = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source"]
    list_falseColumnName = ["Publication day", "Publication Month", " Year", "Authors", "Title", "Abstract", "Source"]
    list_falseColumnName_more = ["Publication Day", "Publication Month", "Publication Year", "Authors", "Title", "Abstract", "Source", "azerty"]
    list_falseColumnName_less = ["Publication Day", "Publication Year", "Authors", "Title"]
-    assert check_columnName(list_trueColumnName) == (True, [])
-    assert check_columnName(list_falseColumnName) == (False, ["Publication Day", "Publication Year"])
-    assert check_columnName(list_falseColumnName_more) == (False, [])
-    assert check_columnName(list_falseColumnName_less) == (False, ["Publication Month", "Abstract", "Source"])
+    assert check_columnName(list_trueColumnName, correct_column) == (True, [])
+    assert check_columnName(list_falseColumnName, correct_column) == (False, ["Publication Day", "Publication Year"])
+    assert check_columnName(list_falseColumnName_more, correct_column) == (False, [])
+    assert check_columnName(list_falseColumnName_less, correct_column) == (False, ["Publication Month", "Abstract", "Source"])


def test_check_date():
    assert check_date(1,1,1)
    assert not check_date(0,1,1)
    assert not check_date(2000,2,30)
    assert not check_date(2000,13,1)


-@pytest.mark.parametrize("test_file_copy", ["correct.csv"], indirect=True)
-def test_check_fileContent(test_file_copy):
-    with open(test_file_copy, "r") as f:
-        f_data = BytesIO(f.read().encode("utf-8"))
-    assert check_fileContent(f_data, "utf-8") == (True, None)